ocfs2: ocfs2_claim_*() don't need an ocfs2_super argument.
[linux-2.6-block.git] / fs / ocfs2 / alloc.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 02111-1307, USA.
24  */
25
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35
36 #include "ocfs2.h"
37
38 #include "alloc.h"
39 #include "aops.h"
40 #include "blockcheck.h"
41 #include "dlmglue.h"
42 #include "extent_map.h"
43 #include "inode.h"
44 #include "journal.h"
45 #include "localalloc.h"
46 #include "suballoc.h"
47 #include "sysfile.h"
48 #include "file.h"
49 #include "super.h"
50 #include "uptodate.h"
51 #include "xattr.h"
52 #include "refcounttree.h"
53
54 #include "buffer_head_io.h"
55
/*
 * Result of comparing an existing extent record with a record that is
 * about to be inserted.  CONTIG_NONE means the two cannot be merged.
 */
enum ocfs2_contig_type {
        CONTIG_NONE      = 0,
        CONTIG_LEFT      = 1,
        CONTIG_RIGHT     = 2,
        CONTIG_LEFTRIGHT = 3,
};
62
/*
 * Forward declaration: the definition lives further down in this file,
 * but ocfs2_et_extent_contig() needs it as its default implementation.
 */
static enum ocfs2_contig_type
ocfs2_extent_rec_contig(struct super_block *sb,
                        struct ocfs2_extent_rec *ext,
                        struct ocfs2_extent_rec *insert_rec);
67 /*
68  * Operations for a specific extent tree type.
69  *
70  * To implement an on-disk btree (extent tree) type in ocfs2, add
71  * an ocfs2_extent_tree_operations structure and the matching
72  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
73  * for the allocation portion of the extent tree.
74  */
75 struct ocfs2_extent_tree_operations {
76         /*
77          * last_eb_blk is the block number of the right most leaf extent
78          * block.  Most on-disk structures containing an extent tree store
79          * this value for fast access.  The ->eo_set_last_eb_blk() and
80          * ->eo_get_last_eb_blk() operations access this value.  They are
81          *  both required.
82          */
83         void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
84                                    u64 blkno);
85         u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
86
87         /*
88          * The on-disk structure usually keeps track of how many total
89          * clusters are stored in this extent tree.  This function updates
90          * that value.  new_clusters is the delta, and must be
91          * added to the total.  Required.
92          */
93         void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
94                                    u32 new_clusters);
95
96         /*
97          * If this extent tree is supported by an extent map, insert
98          * a record into the map.
99          */
100         void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
101                                      struct ocfs2_extent_rec *rec);
102
103         /*
104          * If this extent tree is supported by an extent map, truncate the
105          * map to clusters,
106          */
107         void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
108                                        u32 clusters);
109
110         /*
111          * If ->eo_insert_check() exists, it is called before rec is
112          * inserted into the extent tree.  It is optional.
113          */
114         int (*eo_insert_check)(struct ocfs2_extent_tree *et,
115                                struct ocfs2_extent_rec *rec);
116         int (*eo_sanity_check)(struct ocfs2_extent_tree *et);
117
118         /*
119          * --------------------------------------------------------------
120          * The remaining are internal to ocfs2_extent_tree and don't have
121          * accessor functions
122          */
123
124         /*
125          * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
126          * It is required.
127          */
128         void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
129
130         /*
131          * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
132          * it exists.  If it does not, et->et_max_leaf_clusters is set
133          * to 0 (unlimited).  Optional.
134          */
135         void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
136
137         /*
138          * ->eo_extent_contig test whether the 2 ocfs2_extent_rec
139          * are contiguous or not. Optional. Don't need to set it if use
140          * ocfs2_extent_rec as the tree leaf.
141          */
142         enum ocfs2_contig_type
143                 (*eo_extent_contig)(struct ocfs2_extent_tree *et,
144                                     struct ocfs2_extent_rec *ext,
145                                     struct ocfs2_extent_rec *insert_rec);
146 };
147
148
149 /*
150  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
151  * in the methods.
152  */
153 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
154 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
155                                          u64 blkno);
156 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
157                                          u32 clusters);
158 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
159                                            struct ocfs2_extent_rec *rec);
160 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
161                                              u32 clusters);
162 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
163                                      struct ocfs2_extent_rec *rec);
164 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
165 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
166 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
167         .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
168         .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
169         .eo_update_clusters     = ocfs2_dinode_update_clusters,
170         .eo_extent_map_insert   = ocfs2_dinode_extent_map_insert,
171         .eo_extent_map_truncate = ocfs2_dinode_extent_map_truncate,
172         .eo_insert_check        = ocfs2_dinode_insert_check,
173         .eo_sanity_check        = ocfs2_dinode_sanity_check,
174         .eo_fill_root_el        = ocfs2_dinode_fill_root_el,
175 };
176
177 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
178                                          u64 blkno)
179 {
180         struct ocfs2_dinode *di = et->et_object;
181
182         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
183         di->i_last_eb_blk = cpu_to_le64(blkno);
184 }
185
186 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
187 {
188         struct ocfs2_dinode *di = et->et_object;
189
190         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
191         return le64_to_cpu(di->i_last_eb_blk);
192 }
193
194 static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
195                                          u32 clusters)
196 {
197         struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
198         struct ocfs2_dinode *di = et->et_object;
199
200         le32_add_cpu(&di->i_clusters, clusters);
201         spin_lock(&oi->ip_lock);
202         oi->ip_clusters = le32_to_cpu(di->i_clusters);
203         spin_unlock(&oi->ip_lock);
204 }
205
206 static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
207                                            struct ocfs2_extent_rec *rec)
208 {
209         struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
210
211         ocfs2_extent_map_insert_rec(inode, rec);
212 }
213
214 static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
215                                              u32 clusters)
216 {
217         struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;
218
219         ocfs2_extent_map_trunc(inode, clusters);
220 }
221
222 static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
223                                      struct ocfs2_extent_rec *rec)
224 {
225         struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
226         struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);
227
228         BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
229         mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
230                         (oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
231                         "Device %s, asking for sparse allocation: inode %llu, "
232                         "cpos %u, clusters %u\n",
233                         osb->dev_str,
234                         (unsigned long long)oi->ip_blkno,
235                         rec->e_cpos, oi->ip_clusters);
236
237         return 0;
238 }
239
240 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
241 {
242         struct ocfs2_dinode *di = et->et_object;
243
244         BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
245         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
246
247         return 0;
248 }
249
250 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
251 {
252         struct ocfs2_dinode *di = et->et_object;
253
254         et->et_root_el = &di->id2.i_list;
255 }
256
257
258 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
259 {
260         struct ocfs2_xattr_value_buf *vb = et->et_object;
261
262         et->et_root_el = &vb->vb_xv->xr_list;
263 }
264
265 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
266                                               u64 blkno)
267 {
268         struct ocfs2_xattr_value_buf *vb = et->et_object;
269
270         vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
271 }
272
273 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
274 {
275         struct ocfs2_xattr_value_buf *vb = et->et_object;
276
277         return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
278 }
279
280 static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
281                                               u32 clusters)
282 {
283         struct ocfs2_xattr_value_buf *vb = et->et_object;
284
285         le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
286 }
287
288 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
289         .eo_set_last_eb_blk     = ocfs2_xattr_value_set_last_eb_blk,
290         .eo_get_last_eb_blk     = ocfs2_xattr_value_get_last_eb_blk,
291         .eo_update_clusters     = ocfs2_xattr_value_update_clusters,
292         .eo_fill_root_el        = ocfs2_xattr_value_fill_root_el,
293 };
294
295 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
296 {
297         struct ocfs2_xattr_block *xb = et->et_object;
298
299         et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
300 }
301
302 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
303 {
304         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
305         et->et_max_leaf_clusters =
306                 ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
307 }
308
309 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
310                                              u64 blkno)
311 {
312         struct ocfs2_xattr_block *xb = et->et_object;
313         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
314
315         xt->xt_last_eb_blk = cpu_to_le64(blkno);
316 }
317
318 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
319 {
320         struct ocfs2_xattr_block *xb = et->et_object;
321         struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
322
323         return le64_to_cpu(xt->xt_last_eb_blk);
324 }
325
326 static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
327                                              u32 clusters)
328 {
329         struct ocfs2_xattr_block *xb = et->et_object;
330
331         le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
332 }
333
334 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
335         .eo_set_last_eb_blk     = ocfs2_xattr_tree_set_last_eb_blk,
336         .eo_get_last_eb_blk     = ocfs2_xattr_tree_get_last_eb_blk,
337         .eo_update_clusters     = ocfs2_xattr_tree_update_clusters,
338         .eo_fill_root_el        = ocfs2_xattr_tree_fill_root_el,
339         .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
340 };
341
342 static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
343                                           u64 blkno)
344 {
345         struct ocfs2_dx_root_block *dx_root = et->et_object;
346
347         dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
348 }
349
350 static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
351 {
352         struct ocfs2_dx_root_block *dx_root = et->et_object;
353
354         return le64_to_cpu(dx_root->dr_last_eb_blk);
355 }
356
357 static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
358                                           u32 clusters)
359 {
360         struct ocfs2_dx_root_block *dx_root = et->et_object;
361
362         le32_add_cpu(&dx_root->dr_clusters, clusters);
363 }
364
365 static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
366 {
367         struct ocfs2_dx_root_block *dx_root = et->et_object;
368
369         BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
370
371         return 0;
372 }
373
374 static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
375 {
376         struct ocfs2_dx_root_block *dx_root = et->et_object;
377
378         et->et_root_el = &dx_root->dr_list;
379 }
380
381 static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
382         .eo_set_last_eb_blk     = ocfs2_dx_root_set_last_eb_blk,
383         .eo_get_last_eb_blk     = ocfs2_dx_root_get_last_eb_blk,
384         .eo_update_clusters     = ocfs2_dx_root_update_clusters,
385         .eo_sanity_check        = ocfs2_dx_root_sanity_check,
386         .eo_fill_root_el        = ocfs2_dx_root_fill_root_el,
387 };
388
389 static void ocfs2_refcount_tree_fill_root_el(struct ocfs2_extent_tree *et)
390 {
391         struct ocfs2_refcount_block *rb = et->et_object;
392
393         et->et_root_el = &rb->rf_list;
394 }
395
396 static void ocfs2_refcount_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
397                                                 u64 blkno)
398 {
399         struct ocfs2_refcount_block *rb = et->et_object;
400
401         rb->rf_last_eb_blk = cpu_to_le64(blkno);
402 }
403
404 static u64 ocfs2_refcount_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
405 {
406         struct ocfs2_refcount_block *rb = et->et_object;
407
408         return le64_to_cpu(rb->rf_last_eb_blk);
409 }
410
411 static void ocfs2_refcount_tree_update_clusters(struct ocfs2_extent_tree *et,
412                                                 u32 clusters)
413 {
414         struct ocfs2_refcount_block *rb = et->et_object;
415
416         le32_add_cpu(&rb->rf_clusters, clusters);
417 }
418
419 static enum ocfs2_contig_type
420 ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
421                                   struct ocfs2_extent_rec *ext,
422                                   struct ocfs2_extent_rec *insert_rec)
423 {
424         return CONTIG_NONE;
425 }
426
427 static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
428         .eo_set_last_eb_blk     = ocfs2_refcount_tree_set_last_eb_blk,
429         .eo_get_last_eb_blk     = ocfs2_refcount_tree_get_last_eb_blk,
430         .eo_update_clusters     = ocfs2_refcount_tree_update_clusters,
431         .eo_fill_root_el        = ocfs2_refcount_tree_fill_root_el,
432         .eo_extent_contig       = ocfs2_refcount_tree_extent_contig,
433 };
434
435 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
436                                      struct ocfs2_caching_info *ci,
437                                      struct buffer_head *bh,
438                                      ocfs2_journal_access_func access,
439                                      void *obj,
440                                      struct ocfs2_extent_tree_operations *ops)
441 {
442         et->et_ops = ops;
443         et->et_root_bh = bh;
444         et->et_ci = ci;
445         et->et_root_journal_access = access;
446         if (!obj)
447                 obj = (void *)bh->b_data;
448         et->et_object = obj;
449
450         et->et_ops->eo_fill_root_el(et);
451         if (!et->et_ops->eo_fill_max_leaf_clusters)
452                 et->et_max_leaf_clusters = 0;
453         else
454                 et->et_ops->eo_fill_max_leaf_clusters(et);
455 }
456
457 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
458                                    struct ocfs2_caching_info *ci,
459                                    struct buffer_head *bh)
460 {
461         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
462                                  NULL, &ocfs2_dinode_et_ops);
463 }
464
465 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
466                                        struct ocfs2_caching_info *ci,
467                                        struct buffer_head *bh)
468 {
469         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
470                                  NULL, &ocfs2_xattr_tree_et_ops);
471 }
472
473 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
474                                         struct ocfs2_caching_info *ci,
475                                         struct ocfs2_xattr_value_buf *vb)
476 {
477         __ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
478                                  &ocfs2_xattr_value_et_ops);
479 }
480
481 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
482                                     struct ocfs2_caching_info *ci,
483                                     struct buffer_head *bh)
484 {
485         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
486                                  NULL, &ocfs2_dx_root_et_ops);
487 }
488
489 void ocfs2_init_refcount_extent_tree(struct ocfs2_extent_tree *et,
490                                      struct ocfs2_caching_info *ci,
491                                      struct buffer_head *bh)
492 {
493         __ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_rb,
494                                  NULL, &ocfs2_refcount_tree_et_ops);
495 }
496
497 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
498                                             u64 new_last_eb_blk)
499 {
500         et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
501 }
502
503 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
504 {
505         return et->et_ops->eo_get_last_eb_blk(et);
506 }
507
508 static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
509                                             u32 clusters)
510 {
511         et->et_ops->eo_update_clusters(et, clusters);
512 }
513
514 static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
515                                               struct ocfs2_extent_rec *rec)
516 {
517         if (et->et_ops->eo_extent_map_insert)
518                 et->et_ops->eo_extent_map_insert(et, rec);
519 }
520
521 static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
522                                                 u32 clusters)
523 {
524         if (et->et_ops->eo_extent_map_truncate)
525                 et->et_ops->eo_extent_map_truncate(et, clusters);
526 }
527
528 static inline int ocfs2_et_root_journal_access(handle_t *handle,
529                                                struct ocfs2_extent_tree *et,
530                                                int type)
531 {
532         return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
533                                           type);
534 }
535
536 static inline enum ocfs2_contig_type
537         ocfs2_et_extent_contig(struct ocfs2_extent_tree *et,
538                                struct ocfs2_extent_rec *rec,
539                                struct ocfs2_extent_rec *insert_rec)
540 {
541         if (et->et_ops->eo_extent_contig)
542                 return et->et_ops->eo_extent_contig(et, rec, insert_rec);
543
544         return ocfs2_extent_rec_contig(
545                                 ocfs2_metadata_cache_get_super(et->et_ci),
546                                 rec, insert_rec);
547 }
548
549 static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
550                                         struct ocfs2_extent_rec *rec)
551 {
552         int ret = 0;
553
554         if (et->et_ops->eo_insert_check)
555                 ret = et->et_ops->eo_insert_check(et, rec);
556         return ret;
557 }
558
559 static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
560 {
561         int ret = 0;
562
563         if (et->et_ops->eo_sanity_check)
564                 ret = et->et_ops->eo_sanity_check(et);
565         return ret;
566 }
567
568 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
569 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
570                                          struct ocfs2_extent_block *eb);
571 static void ocfs2_adjust_rightmost_records(handle_t *handle,
572                                            struct ocfs2_extent_tree *et,
573                                            struct ocfs2_path *path,
574                                            struct ocfs2_extent_rec *insert_rec);
575 /*
576  * Reset the actual path elements so that we can re-use the structure
577  * to build another path. Generally, this involves freeing the buffer
578  * heads.
579  */
580 void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
581 {
582         int i, start = 0, depth = 0;
583         struct ocfs2_path_item *node;
584
585         if (keep_root)
586                 start = 1;
587
588         for(i = start; i < path_num_items(path); i++) {
589                 node = &path->p_node[i];
590
591                 brelse(node->bh);
592                 node->bh = NULL;
593                 node->el = NULL;
594         }
595
596         /*
597          * Tree depth may change during truncate, or insert. If we're
598          * keeping the root extent list, then make sure that our path
599          * structure reflects the proper depth.
600          */
601         if (keep_root)
602                 depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
603         else
604                 path_root_access(path) = NULL;
605
606         path->p_tree_depth = depth;
607 }
608
/*
 * Release every buffer held by @path, root included, and free the
 * structure.  A NULL @path is ignored.
 */
void ocfs2_free_path(struct ocfs2_path *path)
{
        if (!path)
                return;

        ocfs2_reinit_path(path, 0);
        kfree(path);
}
616
617 /*
618  * All the elements of src into dest. After this call, src could be freed
619  * without affecting dest.
620  *
621  * Both paths should have the same root. Any non-root elements of dest
622  * will be freed.
623  */
624 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
625 {
626         int i;
627
628         BUG_ON(path_root_bh(dest) != path_root_bh(src));
629         BUG_ON(path_root_el(dest) != path_root_el(src));
630         BUG_ON(path_root_access(dest) != path_root_access(src));
631
632         ocfs2_reinit_path(dest, 1);
633
634         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
635                 dest->p_node[i].bh = src->p_node[i].bh;
636                 dest->p_node[i].el = src->p_node[i].el;
637
638                 if (dest->p_node[i].bh)
639                         get_bh(dest->p_node[i].bh);
640         }
641 }
642
643 /*
644  * Make the *dest path the same as src and re-initialize src path to
645  * have a root only.
646  */
647 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
648 {
649         int i;
650
651         BUG_ON(path_root_bh(dest) != path_root_bh(src));
652         BUG_ON(path_root_access(dest) != path_root_access(src));
653
654         for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
655                 brelse(dest->p_node[i].bh);
656
657                 dest->p_node[i].bh = src->p_node[i].bh;
658                 dest->p_node[i].el = src->p_node[i].el;
659
660                 src->p_node[i].bh = NULL;
661                 src->p_node[i].el = NULL;
662         }
663 }
664
665 /*
666  * Insert an extent block at given index.
667  *
668  * This will not take an additional reference on eb_bh.
669  */
670 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
671                                         struct buffer_head *eb_bh)
672 {
673         struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
674
675         /*
676          * Right now, no root bh is an extent block, so this helps
677          * catch code errors with dinode trees. The assertion can be
678          * safely removed if we ever need to insert extent block
679          * structures at the root.
680          */
681         BUG_ON(index == 0);
682
683         path->p_node[index].bh = eb_bh;
684         path->p_node[index].el = &eb->h_list;
685 }
686
687 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
688                                          struct ocfs2_extent_list *root_el,
689                                          ocfs2_journal_access_func access)
690 {
691         struct ocfs2_path *path;
692
693         BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
694
695         path = kzalloc(sizeof(*path), GFP_NOFS);
696         if (path) {
697                 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
698                 get_bh(root_bh);
699                 path_root_bh(path) = root_bh;
700                 path_root_el(path) = root_el;
701                 path_root_access(path) = access;
702         }
703
704         return path;
705 }
706
/*
 * Allocate a new root-only path that shares @path's root buffer, root
 * extent list and root access function.  Returns NULL on allocation
 * failure.
 */
struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
        return ocfs2_new_path(path_root_bh(path),
                              path_root_el(path),
                              path_root_access(path));
}
712
713 struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
714 {
715         return ocfs2_new_path(et->et_root_bh, et->et_root_el,
716                               et->et_root_journal_access);
717 }
718
719 /*
720  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
721  * otherwise it's the root_access function.
722  *
723  * I don't like the way this function's name looks next to
724  * ocfs2_journal_access_path(), but I don't have a better one.
725  */
726 int ocfs2_path_bh_journal_access(handle_t *handle,
727                                  struct ocfs2_caching_info *ci,
728                                  struct ocfs2_path *path,
729                                  int idx)
730 {
731         ocfs2_journal_access_func access = path_root_access(path);
732
733         if (!access)
734                 access = ocfs2_journal_access;
735
736         if (idx)
737                 access = ocfs2_journal_access_eb;
738
739         return access(handle, ci, path->p_node[idx].bh,
740                       OCFS2_JOURNAL_ACCESS_WRITE);
741 }
742
743 /*
744  * Convenience function to journal all components in a path.
745  */
746 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
747                               handle_t *handle,
748                               struct ocfs2_path *path)
749 {
750         int i, ret = 0;
751
752         if (!path)
753                 goto out;
754
755         for(i = 0; i < path_num_items(path); i++) {
756                 ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
757                 if (ret < 0) {
758                         mlog_errno(ret);
759                         goto out;
760                 }
761         }
762
763 out:
764         return ret;
765 }
766
767 /*
768  * Return the index of the extent record which contains cluster #v_cluster.
769  * -1 is returned if it was not found.
770  *
771  * Should work fine on interior and exterior nodes.
772  */
773 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
774 {
775         int ret = -1;
776         int i;
777         struct ocfs2_extent_rec *rec;
778         u32 rec_end, rec_start, clusters;
779
780         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
781                 rec = &el->l_recs[i];
782
783                 rec_start = le32_to_cpu(rec->e_cpos);
784                 clusters = ocfs2_rec_clusters(el, rec);
785
786                 rec_end = rec_start + clusters;
787
788                 if (v_cluster >= rec_start && v_cluster < rec_end) {
789                         ret = i;
790                         break;
791                 }
792         }
793
794         return ret;
795 }
796
797 /*
798  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
799  * ocfs2_extent_rec_contig only work properly against leaf nodes!
800  */
801 static int ocfs2_block_extent_contig(struct super_block *sb,
802                                      struct ocfs2_extent_rec *ext,
803                                      u64 blkno)
804 {
805         u64 blk_end = le64_to_cpu(ext->e_blkno);
806
807         blk_end += ocfs2_clusters_to_blocks(sb,
808                                     le16_to_cpu(ext->e_leaf_clusters));
809
810         return blkno == blk_end;
811 }
812
813 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
814                                   struct ocfs2_extent_rec *right)
815 {
816         u32 left_range;
817
818         left_range = le32_to_cpu(left->e_cpos) +
819                 le16_to_cpu(left->e_leaf_clusters);
820
821         return (left_range == le32_to_cpu(right->e_cpos));
822 }
823
824 static enum ocfs2_contig_type
825         ocfs2_extent_rec_contig(struct super_block *sb,
826                                 struct ocfs2_extent_rec *ext,
827                                 struct ocfs2_extent_rec *insert_rec)
828 {
829         u64 blkno = le64_to_cpu(insert_rec->e_blkno);
830
831         /*
832          * Refuse to coalesce extent records with different flag
833          * fields - we don't want to mix unwritten extents with user
834          * data.
835          */
836         if (ext->e_flags != insert_rec->e_flags)
837                 return CONTIG_NONE;
838
839         if (ocfs2_extents_adjacent(ext, insert_rec) &&
840             ocfs2_block_extent_contig(sb, ext, blkno))
841                         return CONTIG_RIGHT;
842
843         blkno = le64_to_cpu(ext->e_blkno);
844         if (ocfs2_extents_adjacent(insert_rec, ext) &&
845             ocfs2_block_extent_contig(sb, insert_rec, blkno))
846                 return CONTIG_LEFT;
847
848         return CONTIG_NONE;
849 }
850
851 /*
852  * NOTE: We can have pretty much any combination of contiguousness and
853  * appending.
854  *
855  * The usefulness of APPEND_TAIL is more in that it lets us know that
856  * we'll have to update the path to that leaf.
857  */
enum ocfs2_append_type {
        /* Zero value. */
        APPEND_NONE = 0,
        /*
         * NOTE(review): presumably "the record is appended at the
         * tail of the rightmost leaf" - confirm against the code that
         * sets ins_appending, which is outside this chunk.
         */
        APPEND_TAIL,
};
862
/*
 * Split classification stored in ocfs2_insert_type.ins_split.
 *
 * NOTE(review): the precise meaning of LEFT vs. RIGHT is defined by
 * the users of this enum, which are not visible here - confirm before
 * relying on it.
 */
enum ocfs2_split_type {
        SPLIT_NONE = 0,
        SPLIT_LEFT,
        SPLIT_RIGHT,
};
868
/*
 * Classification of a pending extent insert: one value of each of the
 * split / append / contig enums plus two integers.
 *
 * NOTE(review): the per-field notes below are inferred from the field
 * names and types only; the code that fills and consumes this struct
 * is outside this chunk - confirm there.
 */
struct ocfs2_insert_type {
        enum ocfs2_split_type   ins_split;
        enum ocfs2_append_type  ins_appending;
        enum ocfs2_contig_type  ins_contig;
        /* presumably the index of the record ins_contig refers to */
        int                     ins_contig_index;
        /* presumably the tree depth at the time of the insert */
        int                     ins_tree_depth;
};
876
/*
 * State describing a potential record merge.
 *
 * NOTE(review): field meanings are inferred from their names; the
 * users of this struct are outside this chunk - confirm there.
 */
struct ocfs2_merge_ctxt {
        enum ocfs2_contig_type  c_contig_type;
        /* int used as a flag - presumably "leaf has an empty extent" */
        int                     c_has_empty_extent;
        /* int used as a flag - presumably "split covers whole record" */
        int                     c_split_covers_rec;
};
882
883 static int ocfs2_validate_extent_block(struct super_block *sb,
884                                        struct buffer_head *bh)
885 {
886         int rc;
887         struct ocfs2_extent_block *eb =
888                 (struct ocfs2_extent_block *)bh->b_data;
889
890         mlog(0, "Validating extent block %llu\n",
891              (unsigned long long)bh->b_blocknr);
892
893         BUG_ON(!buffer_uptodate(bh));
894
895         /*
896          * If the ecc fails, we return the error but otherwise
897          * leave the filesystem running.  We know any error is
898          * local to this block.
899          */
900         rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
901         if (rc) {
902                 mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
903                      (unsigned long long)bh->b_blocknr);
904                 return rc;
905         }
906
907         /*
908          * Errors after here are fatal.
909          */
910
911         if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
912                 ocfs2_error(sb,
913                             "Extent block #%llu has bad signature %.*s",
914                             (unsigned long long)bh->b_blocknr, 7,
915                             eb->h_signature);
916                 return -EINVAL;
917         }
918
919         if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
920                 ocfs2_error(sb,
921                             "Extent block #%llu has an invalid h_blkno "
922                             "of %llu",
923                             (unsigned long long)bh->b_blocknr,
924                             (unsigned long long)le64_to_cpu(eb->h_blkno));
925                 return -EINVAL;
926         }
927
928         if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
929                 ocfs2_error(sb,
930                             "Extent block #%llu has an invalid "
931                             "h_fs_generation of #%u",
932                             (unsigned long long)bh->b_blocknr,
933                             le32_to_cpu(eb->h_fs_generation));
934                 return -EINVAL;
935         }
936
937         return 0;
938 }
939
940 int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
941                             struct buffer_head **bh)
942 {
943         int rc;
944         struct buffer_head *tmp = *bh;
945
946         rc = ocfs2_read_block(ci, eb_blkno, &tmp,
947                               ocfs2_validate_extent_block);
948
949         /* If ocfs2_read_block() got us a new bh, pass it up. */
950         if (!rc && !*bh)
951                 *bh = tmp;
952
953         return rc;
954 }
955
956
957 /*
958  * How many free extents have we got before we need more meta data?
959  */
960 int ocfs2_num_free_extents(struct ocfs2_super *osb,
961                            struct ocfs2_extent_tree *et)
962 {
963         int retval;
964         struct ocfs2_extent_list *el = NULL;
965         struct ocfs2_extent_block *eb;
966         struct buffer_head *eb_bh = NULL;
967         u64 last_eb_blk = 0;
968
969         mlog_entry_void();
970
971         el = et->et_root_el;
972         last_eb_blk = ocfs2_et_get_last_eb_blk(et);
973
974         if (last_eb_blk) {
975                 retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
976                                                  &eb_bh);
977                 if (retval < 0) {
978                         mlog_errno(retval);
979                         goto bail;
980                 }
981                 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
982                 el = &eb->h_list;
983         }
984
985         BUG_ON(el->l_tree_depth != 0);
986
987         retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
988 bail:
989         brelse(eb_bh);
990
991         mlog_exit(retval);
992         return retval;
993 }
994
995 /* expects array to already be allocated
996  *
997  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
998  * l_count for you
999  */
1000 static int ocfs2_create_new_meta_bhs(handle_t *handle,
1001                                      struct ocfs2_extent_tree *et,
1002                                      int wanted,
1003                                      struct ocfs2_alloc_context *meta_ac,
1004                                      struct buffer_head *bhs[])
1005 {
1006         int count, status, i;
1007         u16 suballoc_bit_start;
1008         u32 num_got;
1009         u64 first_blkno;
1010         struct ocfs2_super *osb =
1011                 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1012         struct ocfs2_extent_block *eb;
1013
1014         mlog_entry_void();
1015
1016         count = 0;
1017         while (count < wanted) {
1018                 status = ocfs2_claim_metadata(handle,
1019                                               meta_ac,
1020                                               wanted - count,
1021                                               &suballoc_bit_start,
1022                                               &num_got,
1023                                               &first_blkno);
1024                 if (status < 0) {
1025                         mlog_errno(status);
1026                         goto bail;
1027                 }
1028
1029                 for(i = count;  i < (num_got + count); i++) {
1030                         bhs[i] = sb_getblk(osb->sb, first_blkno);
1031                         if (bhs[i] == NULL) {
1032                                 status = -EIO;
1033                                 mlog_errno(status);
1034                                 goto bail;
1035                         }
1036                         ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);
1037
1038                         status = ocfs2_journal_access_eb(handle, et->et_ci,
1039                                                          bhs[i],
1040                                                          OCFS2_JOURNAL_ACCESS_CREATE);
1041                         if (status < 0) {
1042                                 mlog_errno(status);
1043                                 goto bail;
1044                         }
1045
1046                         memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
1047                         eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
1048                         /* Ok, setup the minimal stuff here. */
1049                         strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
1050                         eb->h_blkno = cpu_to_le64(first_blkno);
1051                         eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
1052                         eb->h_suballoc_slot =
1053                                 cpu_to_le16(meta_ac->ac_alloc_slot);
1054                         eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
1055                         eb->h_list.l_count =
1056                                 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
1057
1058                         suballoc_bit_start++;
1059                         first_blkno++;
1060
1061                         /* We'll also be dirtied by the caller, so
1062                          * this isn't absolutely necessary. */
1063                         ocfs2_journal_dirty(handle, bhs[i]);
1064                 }
1065
1066                 count += num_got;
1067         }
1068
1069         status = 0;
1070 bail:
1071         if (status < 0) {
1072                 for(i = 0; i < wanted; i++) {
1073                         brelse(bhs[i]);
1074                         bhs[i] = NULL;
1075                 }
1076         }
1077         mlog_exit(status);
1078         return status;
1079 }
1080
1081 /*
1082  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1083  *
1084  * Returns the sum of the rightmost extent rec logical offset and
1085  * cluster count.
1086  *
1087  * ocfs2_add_branch() uses this to determine what logical cluster
1088  * value should be populated into the leftmost new branch records.
1089  *
1090  * ocfs2_shift_tree_depth() uses this to determine the # clusters
1091  * value for the new topmost tree record.
1092  */
1093 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1094 {
1095         int i;
1096
1097         i = le16_to_cpu(el->l_next_free_rec) - 1;
1098
1099         return le32_to_cpu(el->l_recs[i].e_cpos) +
1100                 ocfs2_rec_clusters(el, &el->l_recs[i]);
1101 }
1102
1103 /*
1104  * Change range of the branches in the right most path according to the leaf
1105  * extent block's rightmost record.
1106  */
1107 static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1108                                          struct ocfs2_extent_tree *et)
1109 {
1110         int status;
1111         struct ocfs2_path *path = NULL;
1112         struct ocfs2_extent_list *el;
1113         struct ocfs2_extent_rec *rec;
1114
1115         path = ocfs2_new_path_from_et(et);
1116         if (!path) {
1117                 status = -ENOMEM;
1118                 return status;
1119         }
1120
1121         status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
1122         if (status < 0) {
1123                 mlog_errno(status);
1124                 goto out;
1125         }
1126
1127         status = ocfs2_extend_trans(handle, path_num_items(path));
1128         if (status < 0) {
1129                 mlog_errno(status);
1130                 goto out;
1131         }
1132
1133         status = ocfs2_journal_access_path(et->et_ci, handle, path);
1134         if (status < 0) {
1135                 mlog_errno(status);
1136                 goto out;
1137         }
1138
1139         el = path_leaf_el(path);
1140         rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1141
1142         ocfs2_adjust_rightmost_records(handle, et, path, rec);
1143
1144 out:
1145         ocfs2_free_path(path);
1146         return status;
1147 }
1148
/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the root
 * structure.
 *
 * last_eb_bh is required as we have to update its next_leaf pointer
 * for the new last extent block.  On success *last_eb_bh is replaced
 * by the new leaf's buffer (the old buffer is released).
 *
 * the new branch will be 'empty' in the sense that every block will
 * contain a single record with cluster count == 0.
 *
 * The number of extent blocks allocated from meta_ac equals the tree
 * depth of the list the branch is attached to.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_add_branch(handle_t *handle,
                            struct ocfs2_extent_tree *et,
                            struct buffer_head *eb_bh,
                            struct buffer_head **last_eb_bh,
                            struct ocfs2_alloc_context *meta_ac)
{
        int status, new_blocks, i;
        u64 next_blkno, new_last_eb_blk;
        struct buffer_head *bh;
        struct buffer_head **new_eb_bhs = NULL;
        struct ocfs2_extent_block *eb;
        struct ocfs2_extent_list  *eb_el;
        struct ocfs2_extent_list  *el;
        u32 new_cpos, root_end;

        mlog_entry_void();

        BUG_ON(!last_eb_bh || !*last_eb_bh);

        /* el is the list the new branch will hang off. */
        if (eb_bh) {
                eb = (struct ocfs2_extent_block *) eb_bh->b_data;
                el = &eb->h_list;
        } else
                el = et->et_root_el;

        /* we never add a branch to a leaf. */
        BUG_ON(!el->l_tree_depth);

        /* One new block per tree level below el. */
        new_blocks = le16_to_cpu(el->l_tree_depth);

        /*
         * new_cpos: end of the rightmost record in the current last
         * leaf; root_end: end of the rightmost record in the root list.
         */
        eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
        new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
        root_end = ocfs2_sum_rightmost_rec(et->et_root_el);

        /*
         * If there is a gap before the root end and the real end
         * of the rightmost leaf block, we need to remove the gap
         * between new_cpos and root_end first so that the tree
         * is consistent after we add a new branch(it will start
         * from new_cpos).
         */
        if (root_end > new_cpos) {
                mlog(0, "adjust the cluster end from %u to %u\n",
                     root_end, new_cpos);
                status = ocfs2_adjust_rightmost_branch(handle, et);
                if (status) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        /* allocate the number of new eb blocks we need */
        new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
                             GFP_KERNEL);
        if (!new_eb_bhs) {
                status = -ENOMEM;
                mlog_errno(status);
                goto bail;
        }

        status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
                                           meta_ac, new_eb_bhs);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }

        /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
         * linked with the rest of the tree.
         * conversely, new_eb_bhs[0] is the new bottommost leaf.
         *
         * when we leave the loop, new_last_eb_blk will point to the
         * newest leaf, and next_blkno will point to the topmost extent
         * block. */
        next_blkno = new_last_eb_blk = 0;
        for(i = 0; i < new_blocks; i++) {
                bh = new_eb_bhs[i];
                eb = (struct ocfs2_extent_block *) bh->b_data;
                /* ocfs2_create_new_meta_bhs() should create it right! */
                BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
                eb_el = &eb->h_list;

                status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
                                                 OCFS2_JOURNAL_ACCESS_CREATE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }

                /* Block i sits at tree depth i and holds one record. */
                eb->h_next_leaf_blk = 0;
                eb_el->l_tree_depth = cpu_to_le16(i);
                eb_el->l_next_free_rec = cpu_to_le16(1);
                /*
                 * This actually counts as an empty extent as
                 * c_clusters == 0
                 */
                eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
                /* Points at the block one level down (0 for the leaf). */
                eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
                /*
                 * eb_el isn't always an interior node, but even leaf
                 * nodes want a zero'd flags and reserved field so
                 * this gets the whole 32 bits regardless of use.
                 */
                eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
                if (!eb_el->l_tree_depth)
                        new_last_eb_blk = le64_to_cpu(eb->h_blkno);

                ocfs2_journal_dirty(handle, bh);
                next_blkno = le64_to_cpu(eb->h_blkno);
        }

        /* This is a bit hairy. We want to update up to three blocks
         * here without leaving any of them in an inconsistent state
         * in case of error. We don't have to worry about
         * journal_dirty erroring as it won't unless we've aborted the
         * handle (in which case we would never be here) so reserving
         * the write with journal_access is all we need to do. */
        status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
                                         OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        status = ocfs2_et_root_journal_access(handle, et,
                                              OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
                mlog_errno(status);
                goto bail;
        }
        if (eb_bh) {
                status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
                                                 OCFS2_JOURNAL_ACCESS_WRITE);
                if (status < 0) {
                        mlog_errno(status);
                        goto bail;
                }
        }

        /* Link the new branch into the rest of the tree (el will
         * either be on the root_bh, or the extent block passed in. */
        i = le16_to_cpu(el->l_next_free_rec);
        el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
        el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
        el->l_recs[i].e_int_clusters = 0;
        le16_add_cpu(&el->l_next_free_rec, 1);

        /* fe needs a new last extent block pointer, as does the
         * next_leaf on the previously last-extent-block. */
        ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);

        eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
        eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

        ocfs2_journal_dirty(handle, *last_eb_bh);
        ocfs2_journal_dirty(handle, et->et_root_bh);
        if (eb_bh)
                ocfs2_journal_dirty(handle, eb_bh);

        /*
         * Some callers want to track the rightmost leaf so pass it
         * back here.  The extra get_bh() keeps the new leaf alive
         * past the brelse() of the whole array below.
         */
        brelse(*last_eb_bh);
        get_bh(new_eb_bhs[0]);
        *last_eb_bh = new_eb_bhs[0];

        status = 0;
bail:
        /* Drop the array's references on both success and failure. */
        if (new_eb_bhs) {
                for (i = 0; i < new_blocks; i++)
                        brelse(new_eb_bhs[i]);
                kfree(new_eb_bhs);
        }

        mlog_exit(status);
        return status;
}
1337
1338 /*
1339  * adds another level to the allocation tree.
1340  * returns back the new extent block so you can add a branch to it
1341  * after this call.
1342  */
1343 static int ocfs2_shift_tree_depth(handle_t *handle,
1344                                   struct ocfs2_extent_tree *et,
1345                                   struct ocfs2_alloc_context *meta_ac,
1346                                   struct buffer_head **ret_new_eb_bh)
1347 {
1348         int status, i;
1349         u32 new_clusters;
1350         struct buffer_head *new_eb_bh = NULL;
1351         struct ocfs2_extent_block *eb;
1352         struct ocfs2_extent_list  *root_el;
1353         struct ocfs2_extent_list  *eb_el;
1354
1355         mlog_entry_void();
1356
1357         status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1358                                            &new_eb_bh);
1359         if (status < 0) {
1360                 mlog_errno(status);
1361                 goto bail;
1362         }
1363
1364         eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1365         /* ocfs2_create_new_meta_bhs() should create it right! */
1366         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1367
1368         eb_el = &eb->h_list;
1369         root_el = et->et_root_el;
1370
1371         status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
1372                                          OCFS2_JOURNAL_ACCESS_CREATE);
1373         if (status < 0) {
1374                 mlog_errno(status);
1375                 goto bail;
1376         }
1377
1378         /* copy the root extent list data into the new extent block */
1379         eb_el->l_tree_depth = root_el->l_tree_depth;
1380         eb_el->l_next_free_rec = root_el->l_next_free_rec;
1381         for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1382                 eb_el->l_recs[i] = root_el->l_recs[i];
1383
1384         ocfs2_journal_dirty(handle, new_eb_bh);
1385
1386         status = ocfs2_et_root_journal_access(handle, et,
1387                                               OCFS2_JOURNAL_ACCESS_WRITE);
1388         if (status < 0) {
1389                 mlog_errno(status);
1390                 goto bail;
1391         }
1392
1393         new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1394
1395         /* update root_bh now */
1396         le16_add_cpu(&root_el->l_tree_depth, 1);
1397         root_el->l_recs[0].e_cpos = 0;
1398         root_el->l_recs[0].e_blkno = eb->h_blkno;
1399         root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1400         for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1401                 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1402         root_el->l_next_free_rec = cpu_to_le16(1);
1403
1404         /* If this is our 1st tree depth shift, then last_eb_blk
1405          * becomes the allocated extent block */
1406         if (root_el->l_tree_depth == cpu_to_le16(1))
1407                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1408
1409         ocfs2_journal_dirty(handle, et->et_root_bh);
1410
1411         *ret_new_eb_bh = new_eb_bh;
1412         new_eb_bh = NULL;
1413         status = 0;
1414 bail:
1415         brelse(new_eb_bh);
1416
1417         mlog_exit(status);
1418         return status;
1419 }
1420
1421 /*
1422  * Should only be called when there is no space left in any of the
1423  * leaf nodes. What we want to do is find the lowest tree depth
1424  * non-leaf extent block with room for new records. There are three
1425  * valid results of this search:
1426  *
1427  * 1) a lowest extent block is found, then we pass it back in
1428  *    *lowest_eb_bh and return '0'
1429  *
1430  * 2) the search fails to find anything, but the root_el has room. We
1431  *    pass NULL back in *lowest_eb_bh, but still return '0'
1432  *
1433  * 3) the search fails to find anything AND the root_el is full, in
1434  *    which case we return > 0
1435  *
1436  * return status < 0 indicates an error.
1437  */
1438 static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1439                                     struct buffer_head **target_bh)
1440 {
1441         int status = 0, i;
1442         u64 blkno;
1443         struct ocfs2_extent_block *eb;
1444         struct ocfs2_extent_list  *el;
1445         struct buffer_head *bh = NULL;
1446         struct buffer_head *lowest_bh = NULL;
1447
1448         mlog_entry_void();
1449
1450         *target_bh = NULL;
1451
1452         el = et->et_root_el;
1453
1454         while(le16_to_cpu(el->l_tree_depth) > 1) {
1455                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1456                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1457                                     "Owner %llu has empty "
1458                                     "extent list (next_free_rec == 0)",
1459                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
1460                         status = -EIO;
1461                         goto bail;
1462                 }
1463                 i = le16_to_cpu(el->l_next_free_rec) - 1;
1464                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1465                 if (!blkno) {
1466                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
1467                                     "Owner %llu has extent "
1468                                     "list where extent # %d has no physical "
1469                                     "block start",
1470                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
1471                         status = -EIO;
1472                         goto bail;
1473                 }
1474
1475                 brelse(bh);
1476                 bh = NULL;
1477
1478                 status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
1479                 if (status < 0) {
1480                         mlog_errno(status);
1481                         goto bail;
1482                 }
1483
1484                 eb = (struct ocfs2_extent_block *) bh->b_data;
1485                 el = &eb->h_list;
1486
1487                 if (le16_to_cpu(el->l_next_free_rec) <
1488                     le16_to_cpu(el->l_count)) {
1489                         brelse(lowest_bh);
1490                         lowest_bh = bh;
1491                         get_bh(lowest_bh);
1492                 }
1493         }
1494
1495         /* If we didn't find one and the fe doesn't have any room,
1496          * then return '1' */
1497         el = et->et_root_el;
1498         if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1499                 status = 1;
1500
1501         *target_bh = lowest_bh;
1502 bail:
1503         brelse(bh);
1504
1505         mlog_exit(status);
1506         return status;
1507 }
1508
1509 /*
1510  * Grow a b-tree so that it has more records.
1511  *
1512  * We might shift the tree depth in which case existing paths should
1513  * be considered invalid.
1514  *
1515  * Tree depth after the grow is returned via *final_depth.
1516  *
1517  * *last_eb_bh will be updated by ocfs2_add_branch().
1518  */
1519 static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1520                            int *final_depth, struct buffer_head **last_eb_bh,
1521                            struct ocfs2_alloc_context *meta_ac)
1522 {
1523         int ret, shift;
1524         struct ocfs2_extent_list *el = et->et_root_el;
1525         int depth = le16_to_cpu(el->l_tree_depth);
1526         struct buffer_head *bh = NULL;
1527
1528         BUG_ON(meta_ac == NULL);
1529
1530         shift = ocfs2_find_branch_target(et, &bh);
1531         if (shift < 0) {
1532                 ret = shift;
1533                 mlog_errno(ret);
1534                 goto out;
1535         }
1536
1537         /* We traveled all the way to the bottom of the allocation tree
1538          * and didn't find room for any more extents - we need to add
1539          * another tree level */
1540         if (shift) {
1541                 BUG_ON(bh);
1542                 mlog(0, "need to shift tree depth (current = %d)\n", depth);
1543
1544                 /* ocfs2_shift_tree_depth will return us a buffer with
1545                  * the new extent block (so we can pass that to
1546                  * ocfs2_add_branch). */
1547                 ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
1548                 if (ret < 0) {
1549                         mlog_errno(ret);
1550                         goto out;
1551                 }
1552                 depth++;
1553                 if (depth == 1) {
1554                         /*
1555                          * Special case: we have room now if we shifted from
1556                          * tree_depth 0, so no more work needs to be done.
1557                          *
1558                          * We won't be calling add_branch, so pass
1559                          * back *last_eb_bh as the new leaf. At depth
1560                          * zero, it should always be null so there's
1561                          * no reason to brelse.
1562                          */
1563                         BUG_ON(*last_eb_bh);
1564                         get_bh(bh);
1565                         *last_eb_bh = bh;
1566                         goto out;
1567                 }
1568         }
1569
1570         /* call ocfs2_add_branch to add the final part of the tree with
1571          * the new data. */
1572         mlog(0, "add branch. bh = %p\n", bh);
1573         ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1574                                meta_ac);
1575         if (ret < 0) {
1576                 mlog_errno(ret);
1577                 goto out;
1578         }
1579
1580 out:
1581         if (final_depth)
1582                 *final_depth = depth;
1583         brelse(bh);
1584         return ret;
1585 }
1586
1587 /*
1588  * This function will discard the rightmost extent record.
1589  */
1590 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1591 {
1592         int next_free = le16_to_cpu(el->l_next_free_rec);
1593         int count = le16_to_cpu(el->l_count);
1594         unsigned int num_bytes;
1595
1596         BUG_ON(!next_free);
1597         /* This will cause us to go off the end of our extent list. */
1598         BUG_ON(next_free >= count);
1599
1600         num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1601
1602         memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1603 }
1604
1605 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1606                               struct ocfs2_extent_rec *insert_rec)
1607 {
1608         int i, insert_index, next_free, has_empty, num_bytes;
1609         u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1610         struct ocfs2_extent_rec *rec;
1611
1612         next_free = le16_to_cpu(el->l_next_free_rec);
1613         has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1614
1615         BUG_ON(!next_free);
1616
1617         /* The tree code before us didn't allow enough room in the leaf. */
1618         BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1619
1620         /*
1621          * The easiest way to approach this is to just remove the
1622          * empty extent and temporarily decrement next_free.
1623          */
1624         if (has_empty) {
1625                 /*
1626                  * If next_free was 1 (only an empty extent), this
1627                  * loop won't execute, which is fine. We still want
1628                  * the decrement above to happen.
1629                  */
1630                 for(i = 0; i < (next_free - 1); i++)
1631                         el->l_recs[i] = el->l_recs[i+1];
1632
1633                 next_free--;
1634         }
1635
1636         /*
1637          * Figure out what the new record index should be.
1638          */
1639         for(i = 0; i < next_free; i++) {
1640                 rec = &el->l_recs[i];
1641
1642                 if (insert_cpos < le32_to_cpu(rec->e_cpos))
1643                         break;
1644         }
1645         insert_index = i;
1646
1647         mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1648              insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1649
1650         BUG_ON(insert_index < 0);
1651         BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1652         BUG_ON(insert_index > next_free);
1653
1654         /*
1655          * No need to memmove if we're just adding to the tail.
1656          */
1657         if (insert_index != next_free) {
1658                 BUG_ON(next_free >= le16_to_cpu(el->l_count));
1659
1660                 num_bytes = next_free - insert_index;
1661                 num_bytes *= sizeof(struct ocfs2_extent_rec);
1662                 memmove(&el->l_recs[insert_index + 1],
1663                         &el->l_recs[insert_index],
1664                         num_bytes);
1665         }
1666
1667         /*
1668          * Either we had an empty extent, and need to re-increment or
1669          * there was no empty extent on a non full rightmost leaf node,
1670          * in which case we still need to increment.
1671          */
1672         next_free++;
1673         el->l_next_free_rec = cpu_to_le16(next_free);
1674         /*
1675          * Make sure none of the math above just messed up our tree.
1676          */
1677         BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1678
1679         el->l_recs[insert_index] = *insert_rec;
1680
1681 }
1682
1683 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1684 {
1685         int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1686
1687         BUG_ON(num_recs == 0);
1688
1689         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1690                 num_recs--;
1691                 size = num_recs * sizeof(struct ocfs2_extent_rec);
1692                 memmove(&el->l_recs[0], &el->l_recs[1], size);
1693                 memset(&el->l_recs[num_recs], 0,
1694                        sizeof(struct ocfs2_extent_rec));
1695                 el->l_next_free_rec = cpu_to_le16(num_recs);
1696         }
1697 }
1698
1699 /*
1700  * Create an empty extent record .
1701  *
1702  * l_next_free_rec may be updated.
1703  *
1704  * If an empty extent already exists do nothing.
1705  */
1706 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1707 {
1708         int next_free = le16_to_cpu(el->l_next_free_rec);
1709
1710         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1711
1712         if (next_free == 0)
1713                 goto set_and_inc;
1714
1715         if (ocfs2_is_empty_extent(&el->l_recs[0]))
1716                 return;
1717
1718         mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1719                         "Asked to create an empty extent in a full list:\n"
1720                         "count = %u, tree depth = %u",
1721                         le16_to_cpu(el->l_count),
1722                         le16_to_cpu(el->l_tree_depth));
1723
1724         ocfs2_shift_records_right(el);
1725
1726 set_and_inc:
1727         le16_add_cpu(&el->l_next_free_rec, 1);
1728         memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1729 }
1730
1731 /*
1732  * For a rotation which involves two leaf nodes, the "root node" is
1733  * the lowest level tree node which contains a path to both leafs. This
1734  * resulting set of information can be used to form a complete "subtree"
1735  *
1736  * This function is passed two full paths from the dinode down to a
1737  * pair of adjacent leaves. It's task is to figure out which path
1738  * index contains the subtree root - this can be the root index itself
1739  * in a worst-case rotation.
1740  *
1741  * The array index of the subtree root is passed back.
1742  */
1743 int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
1744                             struct ocfs2_path *left,
1745                             struct ocfs2_path *right)
1746 {
1747         int i = 0;
1748
1749         /*
1750          * Check that the caller passed in two paths from the same tree.
1751          */
1752         BUG_ON(path_root_bh(left) != path_root_bh(right));
1753
1754         do {
1755                 i++;
1756
1757                 /*
1758                  * The caller didn't pass two adjacent paths.
1759                  */
1760                 mlog_bug_on_msg(i > left->p_tree_depth,
1761                                 "Owner %llu, left depth %u, right depth %u\n"
1762                                 "left leaf blk %llu, right leaf blk %llu\n",
1763                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
1764                                 left->p_tree_depth, right->p_tree_depth,
1765                                 (unsigned long long)path_leaf_bh(left)->b_blocknr,
1766                                 (unsigned long long)path_leaf_bh(right)->b_blocknr);
1767         } while (left->p_node[i].bh->b_blocknr ==
1768                  right->p_node[i].bh->b_blocknr);
1769
1770         return i - 1;
1771 }
1772
1773 typedef void (path_insert_t)(void *, struct buffer_head *);
1774
1775 /*
1776  * Traverse a btree path in search of cpos, starting at root_el.
1777  *
1778  * This code can be called with a cpos larger than the tree, in which
1779  * case it will return the rightmost path.
1780  */
1781 static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1782                              struct ocfs2_extent_list *root_el, u32 cpos,
1783                              path_insert_t *func, void *data)
1784 {
1785         int i, ret = 0;
1786         u32 range;
1787         u64 blkno;
1788         struct buffer_head *bh = NULL;
1789         struct ocfs2_extent_block *eb;
1790         struct ocfs2_extent_list *el;
1791         struct ocfs2_extent_rec *rec;
1792
1793         el = root_el;
1794         while (el->l_tree_depth) {
1795                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
1796                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1797                                     "Owner %llu has empty extent list at "
1798                                     "depth %u\n",
1799                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1800                                     le16_to_cpu(el->l_tree_depth));
1801                         ret = -EROFS;
1802                         goto out;
1803
1804                 }
1805
1806                 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1807                         rec = &el->l_recs[i];
1808
1809                         /*
1810                          * In the case that cpos is off the allocation
1811                          * tree, this should just wind up returning the
1812                          * rightmost record.
1813                          */
1814                         range = le32_to_cpu(rec->e_cpos) +
1815                                 ocfs2_rec_clusters(el, rec);
1816                         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1817                             break;
1818                 }
1819
1820                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1821                 if (blkno == 0) {
1822                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1823                                     "Owner %llu has bad blkno in extent list "
1824                                     "at depth %u (index %d)\n",
1825                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1826                                     le16_to_cpu(el->l_tree_depth), i);
1827                         ret = -EROFS;
1828                         goto out;
1829                 }
1830
1831                 brelse(bh);
1832                 bh = NULL;
1833                 ret = ocfs2_read_extent_block(ci, blkno, &bh);
1834                 if (ret) {
1835                         mlog_errno(ret);
1836                         goto out;
1837                 }
1838
1839                 eb = (struct ocfs2_extent_block *) bh->b_data;
1840                 el = &eb->h_list;
1841
1842                 if (le16_to_cpu(el->l_next_free_rec) >
1843                     le16_to_cpu(el->l_count)) {
1844                         ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1845                                     "Owner %llu has bad count in extent list "
1846                                     "at block %llu (next free=%u, count=%u)\n",
1847                                     (unsigned long long)ocfs2_metadata_cache_owner(ci),
1848                                     (unsigned long long)bh->b_blocknr,
1849                                     le16_to_cpu(el->l_next_free_rec),
1850                                     le16_to_cpu(el->l_count));
1851                         ret = -EROFS;
1852                         goto out;
1853                 }
1854
1855                 if (func)
1856                         func(data, bh);
1857         }
1858
1859 out:
1860         /*
1861          * Catch any trailing bh that the loop didn't handle.
1862          */
1863         brelse(bh);
1864
1865         return ret;
1866 }
1867
1868 /*
1869  * Given an initialized path (that is, it has a valid root extent
1870  * list), this function will traverse the btree in search of the path
1871  * which would contain cpos.
1872  *
1873  * The path traveled is recorded in the path structure.
1874  *
1875  * Note that this will not do any comparisons on leaf node extent
1876  * records, so it will work fine in the case that we just added a tree
1877  * branch.
1878  */
1879 struct find_path_data {
1880         int index;
1881         struct ocfs2_path *path;
1882 };
1883 static void find_path_ins(void *data, struct buffer_head *bh)
1884 {
1885         struct find_path_data *fp = data;
1886
1887         get_bh(bh);
1888         ocfs2_path_insert_eb(fp->path, fp->index, bh);
1889         fp->index++;
1890 }
1891 int ocfs2_find_path(struct ocfs2_caching_info *ci,
1892                     struct ocfs2_path *path, u32 cpos)
1893 {
1894         struct find_path_data data;
1895
1896         data.index = 1;
1897         data.path = path;
1898         return __ocfs2_find_path(ci, path_root_el(path), cpos,
1899                                  find_path_ins, &data);
1900 }
1901
1902 static void find_leaf_ins(void *data, struct buffer_head *bh)
1903 {
1904         struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1905         struct ocfs2_extent_list *el = &eb->h_list;
1906         struct buffer_head **ret = data;
1907
1908         /* We want to retain only the leaf block. */
1909         if (le16_to_cpu(el->l_tree_depth) == 0) {
1910                 get_bh(bh);
1911                 *ret = bh;
1912         }
1913 }
1914 /*
1915  * Find the leaf block in the tree which would contain cpos. No
1916  * checking of the actual leaf is done.
1917  *
1918  * Some paths want to call this instead of allocating a path structure
1919  * and calling ocfs2_find_path().
1920  *
1921  * This function doesn't handle non btree extent lists.
1922  */
1923 int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1924                     struct ocfs2_extent_list *root_el, u32 cpos,
1925                     struct buffer_head **leaf_bh)
1926 {
1927         int ret;
1928         struct buffer_head *bh = NULL;
1929
1930         ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1931         if (ret) {
1932                 mlog_errno(ret);
1933                 goto out;
1934         }
1935
1936         *leaf_bh = bh;
1937 out:
1938         return ret;
1939 }
1940
1941 /*
1942  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1943  *
1944  * Basically, we've moved stuff around at the bottom of the tree and
1945  * we need to fix up the extent records above the changes to reflect
1946  * the new changes.
1947  *
1948  * left_rec: the record on the left.
1949  * left_child_el: is the child list pointed to by left_rec
1950  * right_rec: the record to the right of left_rec
1951  * right_child_el: is the child list pointed to by right_rec
1952  *
1953  * By definition, this only works on interior nodes.
1954  */
1955 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1956                                   struct ocfs2_extent_list *left_child_el,
1957                                   struct ocfs2_extent_rec *right_rec,
1958                                   struct ocfs2_extent_list *right_child_el)
1959 {
1960         u32 left_clusters, right_end;
1961
1962         /*
1963          * Interior nodes never have holes. Their cpos is the cpos of
1964          * the leftmost record in their child list. Their cluster
1965          * count covers the full theoretical range of their child list
1966          * - the range between their cpos and the cpos of the record
1967          * immediately to their right.
1968          */
1969         left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1970         if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1971                 BUG_ON(right_child_el->l_tree_depth);
1972                 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1973                 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1974         }
1975         left_clusters -= le32_to_cpu(left_rec->e_cpos);
1976         left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1977
1978         /*
1979          * Calculate the rightmost cluster count boundary before
1980          * moving cpos - we will need to adjust clusters after
1981          * updating e_cpos to keep the same highest cluster count.
1982          */
1983         right_end = le32_to_cpu(right_rec->e_cpos);
1984         right_end += le32_to_cpu(right_rec->e_int_clusters);
1985
1986         right_rec->e_cpos = left_rec->e_cpos;
1987         le32_add_cpu(&right_rec->e_cpos, left_clusters);
1988
1989         right_end -= le32_to_cpu(right_rec->e_cpos);
1990         right_rec->e_int_clusters = cpu_to_le32(right_end);
1991 }
1992
1993 /*
1994  * Adjust the adjacent root node records involved in a
1995  * rotation. left_el_blkno is passed in as a key so that we can easily
1996  * find it's index in the root list.
1997  */
1998 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1999                                       struct ocfs2_extent_list *left_el,
2000                                       struct ocfs2_extent_list *right_el,
2001                                       u64 left_el_blkno)
2002 {
2003         int i;
2004
2005         BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
2006                le16_to_cpu(left_el->l_tree_depth));
2007
2008         for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
2009                 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
2010                         break;
2011         }
2012
2013         /*
2014          * The path walking code should have never returned a root and
2015          * two paths which are not adjacent.
2016          */
2017         BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
2018
2019         ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
2020                                       &root_el->l_recs[i + 1], right_el);
2021 }
2022
2023 /*
2024  * We've changed a leaf block (in right_path) and need to reflect that
2025  * change back up the subtree.
2026  *
2027  * This happens in multiple places:
2028  *   - When we've moved an extent record from the left path leaf to the right
2029  *     path leaf to make room for an empty extent in the left path leaf.
2030  *   - When our insert into the right path leaf is at the leftmost edge
2031  *     and requires an update of the path immediately to it's left. This
2032  *     can occur at the end of some types of rotation and appending inserts.
2033  *   - When we've adjusted the last extent record in the left path leaf and the
2034  *     1st extent record in the right path leaf during cross extent block merge.
2035  */
2036 static void ocfs2_complete_edge_insert(handle_t *handle,
2037                                        struct ocfs2_path *left_path,
2038                                        struct ocfs2_path *right_path,
2039                                        int subtree_index)
2040 {
2041         int i, idx;
2042         struct ocfs2_extent_list *el, *left_el, *right_el;
2043         struct ocfs2_extent_rec *left_rec, *right_rec;
2044         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2045
2046         /*
2047          * Update the counts and position values within all the
2048          * interior nodes to reflect the leaf rotation we just did.
2049          *
2050          * The root node is handled below the loop.
2051          *
2052          * We begin the loop with right_el and left_el pointing to the
2053          * leaf lists and work our way up.
2054          *
2055          * NOTE: within this loop, left_el and right_el always refer
2056          * to the *child* lists.
2057          */
2058         left_el = path_leaf_el(left_path);
2059         right_el = path_leaf_el(right_path);
2060         for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2061                 mlog(0, "Adjust records at index %u\n", i);
2062
2063                 /*
2064                  * One nice property of knowing that all of these
2065                  * nodes are below the root is that we only deal with
2066                  * the leftmost right node record and the rightmost
2067                  * left node record.
2068                  */
2069                 el = left_path->p_node[i].el;
2070                 idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2071                 left_rec = &el->l_recs[idx];
2072
2073                 el = right_path->p_node[i].el;
2074                 right_rec = &el->l_recs[0];
2075
2076                 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2077                                               right_el);
2078
2079                 ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2080                 ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2081
2082                 /*
2083                  * Setup our list pointers now so that the current
2084                  * parents become children in the next iteration.
2085                  */
2086                 left_el = left_path->p_node[i].el;
2087                 right_el = right_path->p_node[i].el;
2088         }
2089
2090         /*
2091          * At the root node, adjust the two adjacent records which
2092          * begin our path to the leaves.
2093          */
2094
2095         el = left_path->p_node[subtree_index].el;
2096         left_el = left_path->p_node[subtree_index + 1].el;
2097         right_el = right_path->p_node[subtree_index + 1].el;
2098
2099         ocfs2_adjust_root_records(el, left_el, right_el,
2100                                   left_path->p_node[subtree_index + 1].bh->b_blocknr);
2101
2102         root_bh = left_path->p_node[subtree_index].bh;
2103
2104         ocfs2_journal_dirty(handle, root_bh);
2105 }
2106
2107 static int ocfs2_rotate_subtree_right(handle_t *handle,
2108                                       struct ocfs2_extent_tree *et,
2109                                       struct ocfs2_path *left_path,
2110                                       struct ocfs2_path *right_path,
2111                                       int subtree_index)
2112 {
2113         int ret, i;
2114         struct buffer_head *right_leaf_bh;
2115         struct buffer_head *left_leaf_bh = NULL;
2116         struct buffer_head *root_bh;
2117         struct ocfs2_extent_list *right_el, *left_el;
2118         struct ocfs2_extent_rec move_rec;
2119
2120         left_leaf_bh = path_leaf_bh(left_path);
2121         left_el = path_leaf_el(left_path);
2122
2123         if (left_el->l_next_free_rec != left_el->l_count) {
2124                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2125                             "Inode %llu has non-full interior leaf node %llu"
2126                             "(next free = %u)",
2127                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2128                             (unsigned long long)left_leaf_bh->b_blocknr,
2129                             le16_to_cpu(left_el->l_next_free_rec));
2130                 return -EROFS;
2131         }
2132
2133         /*
2134          * This extent block may already have an empty record, so we
2135          * return early if so.
2136          */
2137         if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2138                 return 0;
2139
2140         root_bh = left_path->p_node[subtree_index].bh;
2141         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2142
2143         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2144                                            subtree_index);
2145         if (ret) {
2146                 mlog_errno(ret);
2147                 goto out;
2148         }
2149
2150         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2151                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2152                                                    right_path, i);
2153                 if (ret) {
2154                         mlog_errno(ret);
2155                         goto out;
2156                 }
2157
2158                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2159                                                    left_path, i);
2160                 if (ret) {
2161                         mlog_errno(ret);
2162                         goto out;
2163                 }
2164         }
2165
2166         right_leaf_bh = path_leaf_bh(right_path);
2167         right_el = path_leaf_el(right_path);
2168
2169         /* This is a code error, not a disk corruption. */
2170         mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2171                         "because rightmost leaf block %llu is empty\n",
2172                         (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2173                         (unsigned long long)right_leaf_bh->b_blocknr);
2174
2175         ocfs2_create_empty_extent(right_el);
2176
2177         ocfs2_journal_dirty(handle, right_leaf_bh);
2178
2179         /* Do the copy now. */
2180         i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2181         move_rec = left_el->l_recs[i];
2182         right_el->l_recs[0] = move_rec;
2183
2184         /*
2185          * Clear out the record we just copied and shift everything
2186          * over, leaving an empty extent in the left leaf.
2187          *
2188          * We temporarily subtract from next_free_rec so that the
2189          * shift will lose the tail record (which is now defunct).
2190          */
2191         le16_add_cpu(&left_el->l_next_free_rec, -1);
2192         ocfs2_shift_records_right(left_el);
2193         memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2194         le16_add_cpu(&left_el->l_next_free_rec, 1);
2195
2196         ocfs2_journal_dirty(handle, left_leaf_bh);
2197
2198         ocfs2_complete_edge_insert(handle, left_path, right_path,
2199                                    subtree_index);
2200
2201 out:
2202         return ret;
2203 }
2204
2205 /*
2206  * Given a full path, determine what cpos value would return us a path
2207  * containing the leaf immediately to the left of the current one.
2208  *
2209  * Will return zero if the path passed in is already the leftmost path.
2210  */
2211 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2212                                          struct ocfs2_path *path, u32 *cpos)
2213 {
2214         int i, j, ret = 0;
2215         u64 blkno;
2216         struct ocfs2_extent_list *el;
2217
2218         BUG_ON(path->p_tree_depth == 0);
2219
2220         *cpos = 0;
2221
2222         blkno = path_leaf_bh(path)->b_blocknr;
2223
2224         /* Start at the tree node just above the leaf and work our way up. */
2225         i = path->p_tree_depth - 1;
2226         while (i >= 0) {
2227                 el = path->p_node[i].el;
2228
2229                 /*
2230                  * Find the extent record just before the one in our
2231                  * path.
2232                  */
2233                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2234                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2235                                 if (j == 0) {
2236                                         if (i == 0) {
2237                                                 /*
2238                                                  * We've determined that the
2239                                                  * path specified is already
2240                                                  * the leftmost one - return a
2241                                                  * cpos of zero.
2242                                                  */
2243                                                 goto out;
2244                                         }
2245                                         /*
2246                                          * The leftmost record points to our
2247                                          * leaf - we need to travel up the
2248                                          * tree one level.
2249                                          */
2250                                         goto next_node;
2251                                 }
2252
2253                                 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2254                                 *cpos = *cpos + ocfs2_rec_clusters(el,
2255                                                            &el->l_recs[j - 1]);
2256                                 *cpos = *cpos - 1;
2257                                 goto out;
2258                         }
2259                 }
2260
2261                 /*
2262                  * If we got here, we never found a valid node where
2263                  * the tree indicated one should be.
2264                  */
2265                 ocfs2_error(sb,
2266                             "Invalid extent tree at extent block %llu\n",
2267                             (unsigned long long)blkno);
2268                 ret = -EROFS;
2269                 goto out;
2270
2271 next_node:
2272                 blkno = path->p_node[i].bh->b_blocknr;
2273                 i--;
2274         }
2275
2276 out:
2277         return ret;
2278 }
2279
2280 /*
2281  * Extend the transaction by enough credits to complete the rotation,
2282  * and still leave at least the original number of credits allocated
2283  * to this transaction.
2284  */
2285 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2286                                            int op_credits,
2287                                            struct ocfs2_path *path)
2288 {
2289         int ret = 0;
2290         int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2291
2292         if (handle->h_buffer_credits < credits)
2293                 ret = ocfs2_extend_trans(handle,
2294                                          credits - handle->h_buffer_credits);
2295
2296         return ret;
2297 }
2298
2299 /*
2300  * Trap the case where we're inserting into the theoretical range past
2301  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2302  * whose cpos is less than ours into the right leaf.
2303  *
2304  * It's only necessary to look at the rightmost record of the left
2305  * leaf because the logic that calls us should ensure that the
2306  * theoretical ranges in the path components above the leaves are
2307  * correct.
2308  */
2309 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2310                                                  u32 insert_cpos)
2311 {
2312         struct ocfs2_extent_list *left_el;
2313         struct ocfs2_extent_rec *rec;
2314         int next_free;
2315
2316         left_el = path_leaf_el(left_path);
2317         next_free = le16_to_cpu(left_el->l_next_free_rec);
2318         rec = &left_el->l_recs[next_free - 1];
2319
2320         if (insert_cpos > le32_to_cpu(rec->e_cpos))
2321                 return 1;
2322         return 0;
2323 }
2324
2325 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2326 {
2327         int next_free = le16_to_cpu(el->l_next_free_rec);
2328         unsigned int range;
2329         struct ocfs2_extent_rec *rec;
2330
2331         if (next_free == 0)
2332                 return 0;
2333
2334         rec = &el->l_recs[0];
2335         if (ocfs2_is_empty_extent(rec)) {
2336                 /* Empty list. */
2337                 if (next_free == 1)
2338                         return 0;
2339                 rec = &el->l_recs[1];
2340         }
2341
2342         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2343         if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2344                 return 1;
2345         return 0;
2346 }
2347
2348 /*
2349  * Rotate all the records in a btree right one record, starting at insert_cpos.
2350  *
2351  * The path to the rightmost leaf should be passed in.
2352  *
2353  * The array is assumed to be large enough to hold an entire path (tree depth).
2354  *
2355  * Upon successful return from this function:
2356  *
2357  * - The 'right_path' array will contain a path to the leaf block
2358  *   whose range contains e_cpos.
2359  * - That leaf block will have a single empty extent in list index 0.
2360  * - In the case that the rotation requires a post-insert update,
2361  *   *ret_left_path will contain a valid path which can be passed to
2362  *   ocfs2_insert_path().
2363  */
2364 static int ocfs2_rotate_tree_right(handle_t *handle,
2365                                    struct ocfs2_extent_tree *et,
2366                                    enum ocfs2_split_type split,
2367                                    u32 insert_cpos,
2368                                    struct ocfs2_path *right_path,
2369                                    struct ocfs2_path **ret_left_path)
2370 {
2371         int ret, start, orig_credits = handle->h_buffer_credits;
2372         u32 cpos;
2373         struct ocfs2_path *left_path = NULL;
2374         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2375
2376         *ret_left_path = NULL;
2377
2378         left_path = ocfs2_new_path_from_path(right_path);
2379         if (!left_path) {
2380                 ret = -ENOMEM;
2381                 mlog_errno(ret);
2382                 goto out;
2383         }
2384
2385         ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2386         if (ret) {
2387                 mlog_errno(ret);
2388                 goto out;
2389         }
2390
2391         mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2392
2393         /*
2394          * What we want to do here is:
2395          *
2396          * 1) Start with the rightmost path.
2397          *
2398          * 2) Determine a path to the leaf block directly to the left
2399          *    of that leaf.
2400          *
2401          * 3) Determine the 'subtree root' - the lowest level tree node
2402          *    which contains a path to both leaves.
2403          *
2404          * 4) Rotate the subtree.
2405          *
2406          * 5) Find the next subtree by considering the left path to be
2407          *    the new right path.
2408          *
2409          * The check at the top of this while loop also accepts
2410          * insert_cpos == cpos because cpos is only a _theoretical_
2411          * value to get us the left path - insert_cpos might very well
2412          * be filling that hole.
2413          *
2414          * Stop at a cpos of '0' because we either started at the
2415          * leftmost branch (i.e., a tree with one branch and a
2416          * rotation inside of it), or we've gone as far as we can in
2417          * rotating subtrees.
2418          */
2419         while (cpos && insert_cpos <= cpos) {
2420                 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2421                      insert_cpos, cpos);
2422
2423                 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2424                 if (ret) {
2425                         mlog_errno(ret);
2426                         goto out;
2427                 }
2428
2429                 mlog_bug_on_msg(path_leaf_bh(left_path) ==
2430                                 path_leaf_bh(right_path),
2431                                 "Owner %llu: error during insert of %u "
2432                                 "(left path cpos %u) results in two identical "
2433                                 "paths ending at %llu\n",
2434                                 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2435                                 insert_cpos, cpos,
2436                                 (unsigned long long)
2437                                 path_leaf_bh(left_path)->b_blocknr);
2438
2439                 if (split == SPLIT_NONE &&
2440                     ocfs2_rotate_requires_path_adjustment(left_path,
2441                                                           insert_cpos)) {
2442
2443                         /*
2444                          * We've rotated the tree as much as we
2445                          * should. The rest is up to
2446                          * ocfs2_insert_path() to complete, after the
2447                          * record insertion. We indicate this
2448                          * situation by returning the left path.
2449                          *
2450                          * The reason we don't adjust the records here
2451                          * before the record insert is that an error
2452                          * later might break the rule where a parent
2453                          * record e_cpos will reflect the actual
2454                          * e_cpos of the 1st nonempty record of the
2455                          * child list.
2456                          */
2457                         *ret_left_path = left_path;
2458                         goto out_ret_path;
2459                 }
2460
2461                 start = ocfs2_find_subtree_root(et, left_path, right_path);
2462
2463                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2464                      start,
2465                      (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2466                      right_path->p_tree_depth);
2467
2468                 ret = ocfs2_extend_rotate_transaction(handle, start,
2469                                                       orig_credits, right_path);
2470                 if (ret) {
2471                         mlog_errno(ret);
2472                         goto out;
2473                 }
2474
2475                 ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2476                                                  right_path, start);
2477                 if (ret) {
2478                         mlog_errno(ret);
2479                         goto out;
2480                 }
2481
2482                 if (split != SPLIT_NONE &&
2483                     ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2484                                                 insert_cpos)) {
2485                         /*
2486                          * A rotate moves the rightmost left leaf
2487                          * record over to the leftmost right leaf
2488                          * slot. If we're doing an extent split
2489                          * instead of a real insert, then we have to
2490                          * check that the extent to be split wasn't
2491                          * just moved over. If it was, then we can
2492                          * exit here, passing left_path back -
2493                          * ocfs2_split_extent() is smart enough to
2494                          * search both leaves.
2495                          */
2496                         *ret_left_path = left_path;
2497                         goto out_ret_path;
2498                 }
2499
2500                 /*
2501                  * There is no need to re-read the next right path
2502                  * as we know that it'll be our current left
2503                  * path. Optimize by copying values instead.
2504                  */
2505                 ocfs2_mv_path(right_path, left_path);
2506
2507                 ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2508                 if (ret) {
2509                         mlog_errno(ret);
2510                         goto out;
2511                 }
2512         }
2513
2514 out:
2515         ocfs2_free_path(left_path);
2516
2517 out_ret_path:
2518         return ret;
2519 }
2520
2521 static int ocfs2_update_edge_lengths(handle_t *handle,
2522                                      struct ocfs2_extent_tree *et,
2523                                      int subtree_index, struct ocfs2_path *path)
2524 {
2525         int i, idx, ret;
2526         struct ocfs2_extent_rec *rec;
2527         struct ocfs2_extent_list *el;
2528         struct ocfs2_extent_block *eb;
2529         u32 range;
2530
2531         /*
2532          * In normal tree rotation process, we will never touch the
2533          * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2534          * doesn't reserve the credits for them either.
2535          *
2536          * But we do have a special case here which will update the rightmost
2537          * records for all the bh in the path.
2538          * So we have to allocate extra credits and access them.
2539          */
2540         ret = ocfs2_extend_trans(handle, subtree_index);
2541         if (ret) {
2542                 mlog_errno(ret);
2543                 goto out;
2544         }
2545
2546         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2547         if (ret) {
2548                 mlog_errno(ret);
2549                 goto out;
2550         }
2551
2552         /* Path should always be rightmost. */
2553         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2554         BUG_ON(eb->h_next_leaf_blk != 0ULL);
2555
2556         el = &eb->h_list;
2557         BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2558         idx = le16_to_cpu(el->l_next_free_rec) - 1;
2559         rec = &el->l_recs[idx];
2560         range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2561
2562         for (i = 0; i < path->p_tree_depth; i++) {
2563                 el = path->p_node[i].el;
2564                 idx = le16_to_cpu(el->l_next_free_rec) - 1;
2565                 rec = &el->l_recs[idx];
2566
2567                 rec->e_int_clusters = cpu_to_le32(range);
2568                 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2569
2570                 ocfs2_journal_dirty(handle, path->p_node[i].bh);
2571         }
2572 out:
2573         return ret;
2574 }
2575
2576 static void ocfs2_unlink_path(handle_t *handle,
2577                               struct ocfs2_extent_tree *et,
2578                               struct ocfs2_cached_dealloc_ctxt *dealloc,
2579                               struct ocfs2_path *path, int unlink_start)
2580 {
2581         int ret, i;
2582         struct ocfs2_extent_block *eb;
2583         struct ocfs2_extent_list *el;
2584         struct buffer_head *bh;
2585
2586         for(i = unlink_start; i < path_num_items(path); i++) {
2587                 bh = path->p_node[i].bh;
2588
2589                 eb = (struct ocfs2_extent_block *)bh->b_data;
2590                 /*
2591                  * Not all nodes might have had their final count
2592                  * decremented by the caller - handle this here.
2593                  */
2594                 el = &eb->h_list;
2595                 if (le16_to_cpu(el->l_next_free_rec) > 1) {
2596                         mlog(ML_ERROR,
2597                              "Inode %llu, attempted to remove extent block "
2598                              "%llu with %u records\n",
2599                              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2600                              (unsigned long long)le64_to_cpu(eb->h_blkno),
2601                              le16_to_cpu(el->l_next_free_rec));
2602
2603                         ocfs2_journal_dirty(handle, bh);
2604                         ocfs2_remove_from_cache(et->et_ci, bh);
2605                         continue;
2606                 }
2607
2608                 el->l_next_free_rec = 0;
2609                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2610
2611                 ocfs2_journal_dirty(handle, bh);
2612
2613                 ret = ocfs2_cache_extent_block_free(dealloc, eb);
2614                 if (ret)
2615                         mlog_errno(ret);
2616
2617                 ocfs2_remove_from_cache(et->et_ci, bh);
2618         }
2619 }
2620
2621 static void ocfs2_unlink_subtree(handle_t *handle,
2622                                  struct ocfs2_extent_tree *et,
2623                                  struct ocfs2_path *left_path,
2624                                  struct ocfs2_path *right_path,
2625                                  int subtree_index,
2626                                  struct ocfs2_cached_dealloc_ctxt *dealloc)
2627 {
2628         int i;
2629         struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2630         struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2631         struct ocfs2_extent_list *el;
2632         struct ocfs2_extent_block *eb;
2633
2634         el = path_leaf_el(left_path);
2635
2636         eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2637
2638         for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2639                 if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2640                         break;
2641
2642         BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2643
2644         memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2645         le16_add_cpu(&root_el->l_next_free_rec, -1);
2646
2647         eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2648         eb->h_next_leaf_blk = 0;
2649
2650         ocfs2_journal_dirty(handle, root_bh);
2651         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2652
2653         ocfs2_unlink_path(handle, et, dealloc, right_path,
2654                           subtree_index + 1);
2655 }
2656
/*
 * Perform one left rotation between the leaves of left_path and
 * right_path, which must share the node at subtree_index (checked with
 * BUG_ON).
 *
 * Nothing is done, and 0 is returned, unless the first record of the
 * left leaf is an empty extent.
 *
 * *deleted is cleared on entry and set to 1 only when the right leaf is
 * the rightmost one (h_next_leaf_blk == 0) with l_next_free_rec == 1, in
 * which case the right path below subtree_index is unlinked, the edge
 * lengths of left_path are updated and the tree's last extent block is
 * set to the left leaf.
 *
 * Returns 0 on success, -EAGAIN when the right leaf starts with an empty
 * extent but is not the rightmost leaf, or the error of a failed journal
 * access / ocfs2_update_edge_lengths() call.
 */
2657 static int ocfs2_rotate_subtree_left(handle_t *handle,
2658                                      struct ocfs2_extent_tree *et,
2659                                      struct ocfs2_path *left_path,
2660                                      struct ocfs2_path *right_path,
2661                                      int subtree_index,
2662                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
2663                                      int *deleted)
2664 {
2665         int ret, i, del_right_subtree = 0, right_has_empty = 0;
2666         struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2667         struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2668         struct ocfs2_extent_block *eb;
2669
2670         *deleted = 0;
2671
2672         right_leaf_el = path_leaf_el(right_path);
2673         left_leaf_el = path_leaf_el(left_path);
2674         root_bh = left_path->p_node[subtree_index].bh;
2675         BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2676
             /* Without an empty slot in the left leaf there is nothing to fill. */
2677         if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2678                 return 0;
2679
             /* eb is the right leaf's extent block until the unlink branch below. */
2680         eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2681         if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2682                 /*
2683                  * It's legal for us to proceed if the right leaf is
2684                  * the rightmost one and it has an empty extent. There
2685                  * are two cases to handle - whether the leaf will be
2686                  * empty after removal or not. If the leaf isn't empty
2687                  * then just remove the empty extent up front. The
2688                  * next block will handle empty leaves by flagging
2689                  * them for unlink.
2690                  *
2691                  * Non rightmost leaves will throw -EAGAIN and the
2692                  * caller can manually move the subtree and retry.
2693                  */
2694
2695                 if (eb->h_next_leaf_blk != 0ULL)
2696                         return -EAGAIN;
2697
2698                 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2699                         ret = ocfs2_journal_access_eb(handle, et->et_ci,
2700                                                       path_leaf_bh(right_path),
2701                                                       OCFS2_JOURNAL_ACCESS_WRITE);
2702                         if (ret) {
2703                                 mlog_errno(ret);
2704                                 goto out;
2705                         }
2706
2707                         ocfs2_remove_empty_extent(right_leaf_el);
2708                 } else
2709                         right_has_empty = 1;
2710         }
2711
2712         if (eb->h_next_leaf_blk == 0ULL &&
2713             le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2714                 /*
2715                  * We have to update i_last_eb_blk during the meta
2716                  * data delete.
2717                  */
2718                 ret = ocfs2_et_root_journal_access(handle, et,
2719                                                    OCFS2_JOURNAL_ACCESS_WRITE);
2720                 if (ret) {
2721                         mlog_errno(ret);
2722                         goto out;
2723                 }
2724
2725                 del_right_subtree = 1;
2726         }
2727
2728         /*
2729          * Getting here with an empty extent in the right path implies
2730          * that it's the rightmost path and will be deleted.
2731          */
2732         BUG_ON(right_has_empty && !del_right_subtree);
2733
             /*
              * Request journal access for the shared subtree root first, then
              * for every node below it on both paths.
              */
2734         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2735                                            subtree_index);
2736         if (ret) {
2737                 mlog_errno(ret);
2738                 goto out;
2739         }
2740
2741         for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2742                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2743                                                    right_path, i);
2744                 if (ret) {
2745                         mlog_errno(ret);
2746                         goto out;
2747                 }
2748
2749                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2750                                                    left_path, i);
2751                 if (ret) {
2752                         mlog_errno(ret);
2753                         goto out;
2754                 }
2755         }
2756
2757         if (!right_has_empty) {
2758                 /*
2759                  * Only do this if we're moving a real
2760                  * record. Otherwise, the action is delayed until
2761                  * after removal of the right path in which case we
2762                  * can do a simple shift to remove the empty extent.
2763                  */
2764                 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2765                 memset(&right_leaf_el->l_recs[0], 0,
2766                        sizeof(struct ocfs2_extent_rec));
2767         }
2768         if (eb->h_next_leaf_blk == 0ULL) {
2769                 /*
2770                  * Move recs over to get rid of empty extent, decrease
2771                  * next_free. This is allowed to remove the last
2772                  * extent in our leaf (setting l_next_free_rec to
2773                  * zero) - the delete code below won't care.
2774                  */
2775                 ocfs2_remove_empty_extent(right_leaf_el);
2776         }
2777
2778         ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2779         ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2780
2781         if (del_right_subtree) {
2782                 ocfs2_unlink_subtree(handle, et, left_path, right_path,
2783                                      subtree_index, dealloc);
2784                 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
2785                                                 left_path);
2786                 if (ret) {
2787                         mlog_errno(ret);
2788                         goto out;
2789                 }
2790
                     /* From here on eb refers to the left leaf's extent block. */
2791                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2792                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2793
2794                 /*
2795                  * Removal of the extent in the left leaf was skipped
2796                  * above so we could delete the right path
2797                  * 1st.
2798                  */
2799                 if (right_has_empty)
2800                         ocfs2_remove_empty_extent(left_leaf_el);
2801
2802                 ocfs2_journal_dirty(handle, et_root_bh);
2803
2804                 *deleted = 1;
2805         } else
2806                 ocfs2_complete_edge_insert(handle, left_path, right_path,
2807                                            subtree_index);
2808
2809 out:
2810         return ret;
2811 }
2812
2813 /*
2814  * Given a full path, determine what cpos value would return us a path
2815  * containing the leaf immediately to the right of the current one.
2816  *
2817  * Will return zero if the path passed in is already the rightmost path.
2818  *
2819  * This looks similar, but is subtly different to
2820  * ocfs2_find_cpos_for_left_leaf().
2821  */
2822 int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2823                                    struct ocfs2_path *path, u32 *cpos)
2824 {
2825         int i, j, ret = 0;
2826         u64 blkno;
2827         struct ocfs2_extent_list *el;
2828
2829         *cpos = 0;
2830
2831         if (path->p_tree_depth == 0)
2832                 return 0;
2833
2834         blkno = path_leaf_bh(path)->b_blocknr;
2835
2836         /* Start at the tree node just above the leaf and work our way up. */
2837         i = path->p_tree_depth - 1;
2838         while (i >= 0) {
2839                 int next_free;
2840
2841                 el = path->p_node[i].el;
2842
2843                 /*
2844                  * Find the extent record just after the one in our
2845                  * path.
2846                  */
2847                 next_free = le16_to_cpu(el->l_next_free_rec);
2848                 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2849                         if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2850                                 if (j == (next_free - 1)) {
2851                                         if (i == 0) {
2852                                                 /*
2853                                                  * We've determined that the
2854                                                  * path specified is already
2855                                                  * the rightmost one - return a
2856                                                  * cpos of zero.
2857                                                  */
2858                                                 goto out;
2859                                         }
2860                                         /*
2861                                          * The rightmost record points to our
2862                                          * leaf - we need to travel up the
2863                                          * tree one level.
2864                                          */
2865                                         goto next_node;
2866                                 }
2867
2868                                 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2869                                 goto out;
2870                         }
2871                 }
2872
2873                 /*
2874                  * If we got here, we never found a valid node where
2875                  * the tree indicated one should be.
2876                  */
2877                 ocfs2_error(sb,
2878                             "Invalid extent tree at extent block %llu\n",
2879                             (unsigned long long)blkno);
2880                 ret = -EROFS;
2881                 goto out;
2882
2883 next_node:
2884                 blkno = path->p_node[i].bh->b_blocknr;
2885                 i--;
2886         }
2887
2888 out:
2889         return ret;
2890 }
2891
2892 static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2893                                             struct ocfs2_extent_tree *et,
2894                                             struct ocfs2_path *path)
2895 {
2896         int ret;
2897         struct buffer_head *bh = path_leaf_bh(path);
2898         struct ocfs2_extent_list *el = path_leaf_el(path);
2899
2900         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2901                 return 0;
2902
2903         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2904                                            path_num_items(path) - 1);
2905         if (ret) {
2906                 mlog_errno(ret);
2907                 goto out;
2908         }
2909
2910         ocfs2_remove_empty_extent(el);
2911         ocfs2_journal_dirty(handle, bh);
2912
2913 out:
2914         return ret;
2915 }
2916
/*
 * Starting from 'path', whose leaf must begin with an empty extent
 * (enforced with BUG_ON), rotate leftwards through one subtree after
 * another. Each iteration looks up the path to the leaf on the right,
 * extends the transaction using orig_credits, re-requests journal access
 * for the tree root and calls ocfs2_rotate_subtree_left().
 *
 * The loop ends when no leaf further to the right exists (right_cpos is
 * zero), when a rotation reports that it deleted the rightmost subtree,
 * or on error.
 *
 * *empty_extent_path is set to NULL on entry. When
 * ocfs2_rotate_subtree_left() returns -EAGAIN, the right path of that
 * iteration is stored in *empty_extent_path and is not freed here
 * (presumably the caller has to release it - confirm against the
 * caller); -EAGAIN is then the return value.
 *
 * Both working paths still owned by this function are freed before
 * returning. Returns 0 on success or a negative error code.
 */
2917 static int __ocfs2_rotate_tree_left(handle_t *handle,
2918                                     struct ocfs2_extent_tree *et,
2919                                     int orig_credits,
2920                                     struct ocfs2_path *path,
2921                                     struct ocfs2_cached_dealloc_ctxt *dealloc,
2922                                     struct ocfs2_path **empty_extent_path)
2923 {
2924         int ret, subtree_root, deleted;
2925         u32 right_cpos;
2926         struct ocfs2_path *left_path = NULL;
2927         struct ocfs2_path *right_path = NULL;
2928         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2929
2930         BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2931
2932         *empty_extent_path = NULL;
2933
             /* A right_cpos of zero means 'path' is already the rightmost path. */
2934         ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2935         if (ret) {
2936                 mlog_errno(ret);
2937                 goto out;
2938         }
2939
2940         left_path = ocfs2_new_path_from_path(path);
2941         if (!left_path) {
2942                 ret = -ENOMEM;
2943                 mlog_errno(ret);
2944                 goto out;
2945         }
2946
2947         ocfs2_cp_path(left_path, path);
2948
2949         right_path = ocfs2_new_path_from_path(path);
2950         if (!right_path) {
2951                 ret = -ENOMEM;
2952                 mlog_errno(ret);
2953                 goto out;
2954         }
2955
2956         while (right_cpos) {
2957                 ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2958                 if (ret) {
2959                         mlog_errno(ret);
2960                         goto out;
2961                 }
2962
2963                 subtree_root = ocfs2_find_subtree_root(et, left_path,
2964                                                        right_path);
2965
2966                 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2967                      subtree_root,
2968                      (unsigned long long)
2969                      right_path->p_node[subtree_root].bh->b_blocknr,
2970                      right_path->p_tree_depth);
2971
2972                 ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2973                                                       orig_credits, left_path);
2974                 if (ret) {
2975                         mlog_errno(ret);
2976                         goto out;
2977                 }
2978
2979                 /*
2980                  * Caller might still want to make changes to the
2981                  * tree root, so re-add it to the journal here.
2982                  */
2983                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2984                                                    left_path, 0);
2985                 if (ret) {
2986                         mlog_errno(ret);
2987                         goto out;
2988                 }
2989
2990                 ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2991                                                 right_path, subtree_root,
2992                                                 dealloc, &deleted);
2993                 if (ret == -EAGAIN) {
2994                         /*
2995                          * The rotation has to temporarily stop due to
2996                          * the right subtree having an empty
2997                          * extent. Pass it back to the caller for a
2998                          * fixup.
2999                          */
3000                         *empty_extent_path = right_path;
                             /* Cleared so the cleanup at 'out' does not free the returned path. */
3001                         right_path = NULL;
3002                         goto out;
3003                 }
3004                 if (ret) {
3005                         mlog_errno(ret);
3006                         goto out;
3007                 }
3008
3009                 /*
3010                  * The subtree rotate might have removed records on
3011                  * the rightmost edge. If so, then rotation is
3012                  * complete.
3013                  */
3014                 if (deleted)
3015                         break;
3016
                     /* The right path of this round becomes the left path of the next. */
3017                 ocfs2_mv_path(left_path, right_path);
3018
3019                 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3020                                                      &right_cpos);
3021                 if (ret) {
3022                         mlog_errno(ret);
3023                         goto out;
3024                 }
3025         }
3026
3027 out:
3028         ocfs2_free_path(right_path);
3029         ocfs2_free_path(left_path);
3030
3031         return ret;
3032 }
3033
3034 static int ocfs2_remove_rightmost_path(handle_t *handle,
3035                                 struct ocfs2_extent_tree *et,
3036                                 struct ocfs2_path *path,
3037                                 struct ocfs2_cached_dealloc_ctxt *dealloc)
3038 {
3039         int ret, subtree_index;
3040         u32 cpos;
3041         struct ocfs2_path *left_path = NULL;
3042         struct ocfs2_extent_block *eb;
3043         struct ocfs2_extent_list *el;
3044
3045
3046         ret = ocfs2_et_sanity_check(et);
3047         if (ret)
3048                 goto out;
3049         /*
3050          * There's two ways we handle this depending on
3051          * whether path is the only existing one.
3052          */
3053         ret = ocfs2_extend_rotate_transaction(handle, 0,
3054                                               handle->h_buffer_credits,
3055                                               path);
3056         if (ret) {
3057                 mlog_errno(ret);
3058                 goto out;
3059         }
3060
3061         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3062         if (ret) {
3063                 mlog_errno(ret);
3064                 goto out;
3065         }
3066
3067         ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3068                                             path, &cpos);
3069         if (ret) {
3070                 mlog_errno(ret);
3071                 goto out;
3072         }
3073
3074         if (cpos) {
3075                 /*
3076                  * We have a path to the left of this one - it needs
3077                  * an update too.
3078                  */
3079                 left_path = ocfs2_new_path_from_path(path);
3080                 if (!left_path) {
3081                         ret = -ENOMEM;
3082                         mlog_errno(ret);
3083                         goto out;
3084                 }
3085
3086                 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3087                 if (ret) {
3088                         mlog_errno(ret);
3089                         goto out;
3090                 }
3091
3092                 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3093                 if (ret) {
3094                         mlog_errno(ret);
3095                         goto out;
3096                 }
3097
3098                 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3099
3100                 ocfs2_unlink_subtree(handle, et, left_path, path,
3101                                      subtree_index, dealloc);
3102                 ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3103                                                 left_path);
3104                 if (ret) {
3105                         mlog_errno(ret);
3106                         goto out;
3107                 }
3108
3109                 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3110                 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3111         } else {
3112                 /*
3113                  * 'path' is also the leftmost path which
3114                  * means it must be the only one. This gets
3115                  * handled differently because we want to
3116                  * revert the root back to having extents
3117                  * in-line.
3118                  */
3119                 ocfs2_unlink_path(handle, et, dealloc, path, 1);
3120
3121                 el = et->et_root_el;
3122                 el->l_tree_depth = 0;
3123                 el->l_next_free_rec = 0;
3124                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3125
3126                 ocfs2_et_set_last_eb_blk(et, 0);
3127         }
3128
3129         ocfs2_journal_dirty(handle, path_root_bh(path));
3130
3131 out:
3132         ocfs2_free_path(left_path);
3133         return ret;
3134 }
3135
3136 /*
3137  * Left rotation of btree records.
3138  *
3139  * In many ways, this is (unsurprisingly) the opposite of right
3140  * rotation. We start at some non-rightmost path containing an empty
3141  * extent in the leaf block. The code works its way to the rightmost
3142  * path by rotating records to the left in every subtree.
3143  *
3144  * This is used by any code which reduces the number of extent records
3145  * in a leaf. After removal, an empty record should be placed in the
3146  * leftmost list position.
3147  *
3148  * This won't handle a length update of the rightmost path records if
3149  * the rightmost tree leaf record is removed so the caller is
3150  * responsible for detecting and correcting that.
3151  */
3152 static int ocfs2_rotate_tree_left(handle_t *handle,
3153                                   struct ocfs2_extent_tree *et,
3154                                   struct ocfs2_path *path,
3155                                   struct ocfs2_cached_dealloc_ctxt *dealloc)
3156 {
3157         int ret, orig_credits = handle->h_buffer_credits;
3158         struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3159         struct ocfs2_extent_block *eb;
3160         struct ocfs2_extent_list *el;
3161
3162         el = path_leaf_el(path);
3163         if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3164                 return 0;
3165
3166         if (path->p_tree_depth == 0) {
3167 rightmost_no_delete:
3168                 /*
3169                  * Inline extents. This is trivially handled, so do
3170                  * it up front.
3171                  */
3172                 ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3173                 if (ret)
3174                         mlog_errno(ret);
3175                 goto out;
3176         }
3177
3178         /*
3179          * Handle rightmost branch now. There's several cases:
3180          *  1) simple rotation leaving records in there. That's trivial.
3181          *  2) rotation requiring a branch delete - there's no more
3182          *     records left. Two cases of this:
3183          *     a) There are branches to the left.
3184          *     b) This is also the leftmost (the only) branch.
3185          *
3186          *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3187          *  2a) we need the left branch so that we can update it with the unlink
3188          *  2b) we need to bring the root back to inline extents.
3189          */
3190
3191         eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3192         el = &eb->h_list;
3193         if (eb->h_next_leaf_blk == 0) {
3194                 /*
3195                  * This gets a bit tricky if we're going to delete the
3196                  * rightmost path. Get the other cases out of the way
3197                  * 1st.
3198                  */
3199                 if (le16_to_cpu(el->l_next_free_rec) > 1)
3200                         goto rightmost_no_delete;
3201
3202                 if (le16_to_cpu(el->l_next_free_rec) == 0) {
3203                         ret = -EIO;
3204                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3205                                     "Owner %llu has empty extent block at %llu",
3206                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3207                                     (unsigned long long)le64_to_cpu(eb->h_blkno));
3208                         goto out;
3209                 }
3210
3211                 /*
3212                  * XXX: The caller can not trust "path" any more after
3213                  * this as it will have been deleted. What do we do?
3214                  *
3215                  * In theory the rotate-for-merge code will never get
3216                  * here because it'll always ask for a rotate in a
3217                  * nonempty list.
3218                  */
3219
3220                 ret = ocfs2_remove_rightmost_path(handle, et, path,
3221                                                   dealloc);
3222                 if (ret)
3223                         mlog_errno(ret);
3224                 goto out;
3225         }
3226
3227         /*
3228          * Now we can loop, remembering the path we get from -EAGAIN
3229          * and restarting from there.
3230          */
3231 try_rotate:
3232         ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3233                                        dealloc, &restart_path);
3234         if (ret && ret != -EAGAIN) {
3235                 mlog_errno(ret);
3236                 goto out;
3237         }
3238
3239         while (ret == -EAGAIN) {
3240                 tmp_path = restart_path;
3241                 restart_path = NULL;
3242
3243                 ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3244                                                tmp_path, dealloc,
3245                                                &restart_path);
3246                 if (ret && ret != -EAGAIN) {
3247                         mlog_errno(ret);
3248                         goto out;
3249                 }
3250
3251                 ocfs2_free_path(tmp_path);
3252                 tmp_path = NULL;
3253
3254                 if (ret == 0)
3255                         goto try_rotate;
3256         }
3257
3258 out:
3259         ocfs2_free_path(tmp_path);
3260         ocfs2_free_path(restart_path);
3261         return ret;
3262 }
3263
3264 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3265                                 int index)
3266 {
3267         struct ocfs2_extent_rec *rec = &el->l_recs[index];
3268         unsigned int size;
3269
3270         if (rec->e_leaf_clusters == 0) {
3271                 /*
3272                  * We consumed all of the merged-from record. An empty
3273                  * extent cannot exist anywhere but the 1st array
3274                  * position, so move things over if the merged-from
3275                  * record doesn't occupy that position.
3276                  *
3277                  * This creates a new empty extent so the caller
3278                  * should be smart enough to have removed any existing
3279                  * ones.
3280                  */
3281                 if (index > 0) {
3282                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3283                         size = index * sizeof(struct ocfs2_extent_rec);
3284                         memmove(&el->l_recs[1], &el->l_recs[0], size);
3285                 }
3286
3287                 /*
3288                  * Always memset - the caller doesn't check whether it
3289                  * created an empty extent, so there could be junk in
3290                  * the other fields.
3291                  */
3292                 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3293         }
3294 }
3295
3296 static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3297                                 struct ocfs2_path *left_path,
3298                                 struct ocfs2_path **ret_right_path)
3299 {
3300         int ret;
3301         u32 right_cpos;
3302         struct ocfs2_path *right_path = NULL;
3303         struct ocfs2_extent_list *left_el;
3304
3305         *ret_right_path = NULL;
3306
3307         /* This function shouldn't be called for non-trees. */
3308         BUG_ON(left_path->p_tree_depth == 0);
3309
3310         left_el = path_leaf_el(left_path);
3311         BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3312
3313         ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3314                                              left_path, &right_cpos);
3315         if (ret) {
3316                 mlog_errno(ret);
3317                 goto out;
3318         }
3319
3320         /* This function shouldn't be called for the rightmost leaf. */
3321         BUG_ON(right_cpos == 0);
3322
3323         right_path = ocfs2_new_path_from_path(left_path);
3324         if (!right_path) {
3325                 ret = -ENOMEM;
3326                 mlog_errno(ret);
3327                 goto out;
3328         }
3329
3330         ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3331         if (ret) {
3332                 mlog_errno(ret);
3333                 goto out;
3334         }
3335
3336         *ret_right_path = right_path;
3337 out:
3338         if (ret)
3339                 ocfs2_free_path(right_path);
3340         return ret;
3341 }
3342
3343 /*
3344  * Remove split_rec clusters from the record at index and merge them
3345  * onto the beginning of the record "next" to it.
3346  * For index < l_count - 1, the next means the extent rec at index + 1.
3347  * For index == l_count - 1, the "next" means the 1st extent rec of the
3348  * next extent block.
3349  */
3350 static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3351                                  handle_t *handle,
3352                                  struct ocfs2_extent_tree *et,
3353                                  struct ocfs2_extent_rec *split_rec,
3354                                  int index)
3355 {
3356         int ret, next_free, i;
3357         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3358         struct ocfs2_extent_rec *left_rec;
3359         struct ocfs2_extent_rec *right_rec;
3360         struct ocfs2_extent_list *right_el;
3361         struct ocfs2_path *right_path = NULL;
3362         int subtree_index = 0;
3363         struct ocfs2_extent_list *el = path_leaf_el(left_path);
3364         struct buffer_head *bh = path_leaf_bh(left_path);
3365         struct buffer_head *root_bh = NULL;
3366
3367         BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3368         left_rec = &el->l_recs[index];
3369
3370         if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3371             le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3372                 /* we meet with a cross extent block merge. */
3373                 ret = ocfs2_get_right_path(et, left_path, &right_path);
3374                 if (ret) {
3375                         mlog_errno(ret);
3376                         goto out;
3377                 }
3378
3379                 right_el = path_leaf_el(right_path);
3380                 next_free = le16_to_cpu(right_el->l_next_free_rec);
3381                 BUG_ON(next_free <= 0);
3382                 right_rec = &right_el->l_recs[0];
3383                 if (ocfs2_is_empty_extent(right_rec)) {
3384                         BUG_ON(next_free <= 1);
3385                         right_rec = &right_el->l_recs[1];
3386                 }
3387
3388                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3389                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3390                        le32_to_cpu(right_rec->e_cpos));
3391
3392                 subtree_index = ocfs2_find_subtree_root(et, left_path,
3393                                                         right_path);
3394
3395                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3396                                                       handle->h_buffer_credits,
3397                                                       right_path);
3398                 if (ret) {
3399                         mlog_errno(ret);
3400                         goto out;
3401                 }
3402
3403                 root_bh = left_path->p_node[subtree_index].bh;
3404                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3405
3406                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3407                                                    subtree_index);
3408                 if (ret) {
3409                         mlog_errno(ret);
3410                         goto out;
3411                 }
3412
3413                 for (i = subtree_index + 1;
3414                      i < path_num_items(right_path); i++) {
3415                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3416                                                            right_path, i);
3417                         if (ret) {
3418                                 mlog_errno(ret);
3419                                 goto out;
3420                         }
3421
3422                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3423                                                            left_path, i);
3424                         if (ret) {
3425                                 mlog_errno(ret);
3426                                 goto out;
3427                         }
3428                 }
3429
3430         } else {
3431                 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3432                 right_rec = &el->l_recs[index + 1];
3433         }
3434
3435         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3436                                            path_num_items(left_path) - 1);
3437         if (ret) {
3438                 mlog_errno(ret);
3439                 goto out;
3440         }
3441
3442         le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3443
3444         le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3445         le64_add_cpu(&right_rec->e_blkno,
3446                      -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3447                                                split_clusters));
3448         le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3449
3450         ocfs2_cleanup_merge(el, index);
3451
3452         ocfs2_journal_dirty(handle, bh);
3453         if (right_path) {
3454                 ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3455                 ocfs2_complete_edge_insert(handle, left_path, right_path,
3456                                            subtree_index);
3457         }
3458 out:
3459         if (right_path)
3460                 ocfs2_free_path(right_path);
3461         return ret;
3462 }
3463
3464 static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3465                                struct ocfs2_path *right_path,
3466                                struct ocfs2_path **ret_left_path)
3467 {
3468         int ret;
3469         u32 left_cpos;
3470         struct ocfs2_path *left_path = NULL;
3471
3472         *ret_left_path = NULL;
3473
3474         /* This function shouldn't be called for non-trees. */
3475         BUG_ON(right_path->p_tree_depth == 0);
3476
3477         ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3478                                             right_path, &left_cpos);
3479         if (ret) {
3480                 mlog_errno(ret);
3481                 goto out;
3482         }
3483
3484         /* This function shouldn't be called for the leftmost leaf. */
3485         BUG_ON(left_cpos == 0);
3486
3487         left_path = ocfs2_new_path_from_path(right_path);
3488         if (!left_path) {
3489                 ret = -ENOMEM;
3490                 mlog_errno(ret);
3491                 goto out;
3492         }
3493
3494         ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3495         if (ret) {
3496                 mlog_errno(ret);
3497                 goto out;
3498         }
3499
3500         *ret_left_path = left_path;
3501 out:
3502         if (ret)
3503                 ocfs2_free_path(left_path);
3504         return ret;
3505 }
3506
3507 /*
3508  * Remove split_rec clusters from the record at index and merge them
3509  * onto the tail of the record "before" it.
3510  * For index > 0, the "before" means the extent rec at index - 1.
3511  *
3512  * For index == 0, the "before" means the last record of the previous
3513  * extent block. And there is also a situation that we may need to
3514  * remove the rightmost leaf extent block in the right_path and change
3515  * the right path to indicate the new rightmost path.
3516  */
3517 static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3518                                 handle_t *handle,
3519                                 struct ocfs2_extent_tree *et,
3520                                 struct ocfs2_extent_rec *split_rec,
3521                                 struct ocfs2_cached_dealloc_ctxt *dealloc,
3522                                 int index)
3523 {
3524         int ret, i, subtree_index = 0, has_empty_extent = 0;
3525         unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3526         struct ocfs2_extent_rec *left_rec;
3527         struct ocfs2_extent_rec *right_rec;
3528         struct ocfs2_extent_list *el = path_leaf_el(right_path);
3529         struct buffer_head *bh = path_leaf_bh(right_path);
3530         struct buffer_head *root_bh = NULL;
3531         struct ocfs2_path *left_path = NULL;
3532         struct ocfs2_extent_list *left_el;
3533
3534         BUG_ON(index < 0);
3535
3536         right_rec = &el->l_recs[index];
3537         if (index == 0) {
3538                 /* we meet with a cross extent block merge. */
3539                 ret = ocfs2_get_left_path(et, right_path, &left_path);
3540                 if (ret) {
3541                         mlog_errno(ret);
3542                         goto out;
3543                 }
3544
3545                 left_el = path_leaf_el(left_path);
3546                 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3547                        le16_to_cpu(left_el->l_count));
3548
3549                 left_rec = &left_el->l_recs[
3550                                 le16_to_cpu(left_el->l_next_free_rec) - 1];
3551                 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3552                        le16_to_cpu(left_rec->e_leaf_clusters) !=
3553                        le32_to_cpu(split_rec->e_cpos));
3554
3555                 subtree_index = ocfs2_find_subtree_root(et, left_path,
3556                                                         right_path);
3557
3558                 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3559                                                       handle->h_buffer_credits,
3560                                                       left_path);
3561                 if (ret) {
3562                         mlog_errno(ret);
3563                         goto out;
3564                 }
3565
3566                 root_bh = left_path->p_node[subtree_index].bh;
3567                 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3568
3569                 ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3570                                                    subtree_index);
3571                 if (ret) {
3572                         mlog_errno(ret);
3573                         goto out;
3574                 }
3575
3576                 for (i = subtree_index + 1;
3577                      i < path_num_items(right_path); i++) {
3578                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3579                                                            right_path, i);
3580                         if (ret) {
3581                                 mlog_errno(ret);
3582                                 goto out;
3583                         }
3584
3585                         ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3586                                                            left_path, i);
3587                         if (ret) {
3588                                 mlog_errno(ret);
3589                                 goto out;
3590                         }
3591                 }
3592         } else {
3593                 left_rec = &el->l_recs[index - 1];
3594                 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3595                         has_empty_extent = 1;
3596         }
3597
3598         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3599                                            path_num_items(right_path) - 1);
3600         if (ret) {
3601                 mlog_errno(ret);
3602                 goto out;
3603         }
3604
3605         if (has_empty_extent && index == 1) {
3606                 /*
3607                  * The easy case - we can just plop the record right in.
3608                  */
3609                 *left_rec = *split_rec;
3610
3611                 has_empty_extent = 0;
3612         } else
3613                 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3614
3615         le32_add_cpu(&right_rec->e_cpos, split_clusters);
3616         le64_add_cpu(&right_rec->e_blkno,
3617                      ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3618                                               split_clusters));
3619         le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3620
3621         ocfs2_cleanup_merge(el, index);
3622
3623         ocfs2_journal_dirty(handle, bh);
3624         if (left_path) {
3625                 ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3626
3627                 /*
3628                  * In the situation that the right_rec is empty and the extent
3629                  * block is empty also,  ocfs2_complete_edge_insert can't handle
3630                  * it and we need to delete the right extent block.
3631                  */
3632                 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3633                     le16_to_cpu(el->l_next_free_rec) == 1) {
3634
3635                         ret = ocfs2_remove_rightmost_path(handle, et,
3636                                                           right_path,
3637                                                           dealloc);
3638                         if (ret) {
3639                                 mlog_errno(ret);
3640                                 goto out;
3641                         }
3642
3643                         /* Now the rightmost extent block has been deleted.
3644                          * So we use the new rightmost path.
3645                          */
3646                         ocfs2_mv_path(right_path, left_path);
3647                         left_path = NULL;
3648                 } else
3649                         ocfs2_complete_edge_insert(handle, left_path,
3650                                                    right_path, subtree_index);
3651         }
3652 out:
3653         if (left_path)
3654                 ocfs2_free_path(left_path);
3655         return ret;
3656 }
3657
3658 static int ocfs2_try_to_merge_extent(handle_t *handle,
3659                                      struct ocfs2_extent_tree *et,
3660                                      struct ocfs2_path *path,
3661                                      int split_index,
3662                                      struct ocfs2_extent_rec *split_rec,
3663                                      struct ocfs2_cached_dealloc_ctxt *dealloc,
3664                                      struct ocfs2_merge_ctxt *ctxt)
3665 {
3666         int ret = 0;
3667         struct ocfs2_extent_list *el = path_leaf_el(path);
3668         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3669
3670         BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3671
3672         if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3673                 /*
3674                  * The merge code will need to create an empty
3675                  * extent to take the place of the newly
3676                  * emptied slot. Remove any pre-existing empty
3677                  * extents - having more than one in a leaf is
3678                  * illegal.
3679                  */
3680                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3681                 if (ret) {
3682                         mlog_errno(ret);
3683                         goto out;
3684                 }
3685                 split_index--;
3686                 rec = &el->l_recs[split_index];
3687         }
3688
3689         if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3690                 /*
3691                  * Left-right contig implies this.
3692                  */
3693                 BUG_ON(!ctxt->c_split_covers_rec);
3694
3695                 /*
3696                  * Since the leftright insert always covers the entire
3697                  * extent, this call will delete the insert record
3698                  * entirely, resulting in an empty extent record added to
3699                  * the extent block.
3700                  *
3701                  * Since the adding of an empty extent shifts
3702                  * everything back to the right, there's no need to
3703                  * update split_index here.
3704                  *
3705                  * When the split_index is zero, we need to merge it to the
3706                  * prevoius extent block. It is more efficient and easier
3707                  * if we do merge_right first and merge_left later.
3708                  */
3709                 ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3710                                             split_index);
3711                 if (ret) {
3712                         mlog_errno(ret);
3713                         goto out;
3714                 }
3715
3716                 /*
3717                  * We can only get this from logic error above.
3718                  */
3719                 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3720
3721                 /* The merge left us with an empty extent, remove it. */
3722                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3723                 if (ret) {
3724                         mlog_errno(ret);
3725                         goto out;
3726                 }
3727
3728                 rec = &el->l_recs[split_index];
3729
3730                 /*
3731                  * Note that we don't pass split_rec here on purpose -
3732                  * we've merged it into the rec already.
3733                  */
3734                 ret = ocfs2_merge_rec_left(path, handle, et, rec,
3735                                            dealloc, split_index);
3736
3737                 if (ret) {
3738                         mlog_errno(ret);
3739                         goto out;
3740                 }
3741
3742                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3743                 /*
3744                  * Error from this last rotate is not critical, so
3745                  * print but don't bubble it up.
3746                  */
3747                 if (ret)
3748                         mlog_errno(ret);
3749                 ret = 0;
3750         } else {
3751                 /*
3752                  * Merge a record to the left or right.
3753                  *
3754                  * 'contig_type' is relative to the existing record,
3755                  * so for example, if we're "right contig", it's to
3756                  * the record on the left (hence the left merge).
3757                  */
3758                 if (ctxt->c_contig_type == CONTIG_RIGHT) {
3759                         ret = ocfs2_merge_rec_left(path, handle, et,
3760                                                    split_rec, dealloc,
3761                                                    split_index);
3762                         if (ret) {
3763                                 mlog_errno(ret);
3764                                 goto out;
3765                         }
3766                 } else {
3767                         ret = ocfs2_merge_rec_right(path, handle,
3768                                                     et, split_rec,
3769                                                     split_index);
3770                         if (ret) {
3771                                 mlog_errno(ret);
3772                                 goto out;
3773                         }
3774                 }
3775
3776                 if (ctxt->c_split_covers_rec) {
3777                         /*
3778                          * The merge may have left an empty extent in
3779                          * our leaf. Try to rotate it away.
3780                          */
3781                         ret = ocfs2_rotate_tree_left(handle, et, path,
3782                                                      dealloc);
3783                         if (ret)
3784                                 mlog_errno(ret);
3785                         ret = 0;
3786                 }
3787         }
3788
3789 out:
3790         return ret;
3791 }
3792
3793 static void ocfs2_subtract_from_rec(struct super_block *sb,
3794                                     enum ocfs2_split_type split,
3795                                     struct ocfs2_extent_rec *rec,
3796                                     struct ocfs2_extent_rec *split_rec)
3797 {
3798         u64 len_blocks;
3799
3800         len_blocks = ocfs2_clusters_to_blocks(sb,
3801                                 le16_to_cpu(split_rec->e_leaf_clusters));
3802
3803         if (split == SPLIT_LEFT) {
3804                 /*
3805                  * Region is on the left edge of the existing
3806                  * record.
3807                  */
3808                 le32_add_cpu(&rec->e_cpos,
3809                              le16_to_cpu(split_rec->e_leaf_clusters));
3810                 le64_add_cpu(&rec->e_blkno, len_blocks);
3811                 le16_add_cpu(&rec->e_leaf_clusters,
3812                              -le16_to_cpu(split_rec->e_leaf_clusters));
3813         } else {
3814                 /*
3815                  * Region is on the right edge of the existing
3816                  * record.
3817                  */
3818                 le16_add_cpu(&rec->e_leaf_clusters,
3819                              -le16_to_cpu(split_rec->e_leaf_clusters));
3820         }
3821 }
3822
3823 /*
3824  * Do the final bits of extent record insertion at the target leaf
3825  * list. If this leaf is part of an allocation tree, it is assumed
3826  * that the tree above has been prepared.
3827  */
3828 static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3829                                  struct ocfs2_extent_rec *insert_rec,
3830                                  struct ocfs2_extent_list *el,
3831                                  struct ocfs2_insert_type *insert)
3832 {
3833         int i = insert->ins_contig_index;
3834         unsigned int range;
3835         struct ocfs2_extent_rec *rec;
3836
3837         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3838
3839         if (insert->ins_split != SPLIT_NONE) {
3840                 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3841                 BUG_ON(i == -1);
3842                 rec = &el->l_recs[i];
3843                 ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3844                                         insert->ins_split, rec,
3845                                         insert_rec);
3846                 goto rotate;
3847         }
3848
3849         /*
3850          * Contiguous insert - either left or right.
3851          */
3852         if (insert->ins_contig != CONTIG_NONE) {
3853                 rec = &el->l_recs[i];
3854                 if (insert->ins_contig == CONTIG_LEFT) {
3855                         rec->e_blkno = insert_rec->e_blkno;
3856                         rec->e_cpos = insert_rec->e_cpos;
3857                 }
3858                 le16_add_cpu(&rec->e_leaf_clusters,
3859                              le16_to_cpu(insert_rec->e_leaf_clusters));
3860                 return;
3861         }
3862
3863         /*
3864          * Handle insert into an empty leaf.
3865          */
3866         if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3867             ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3868              ocfs2_is_empty_extent(&el->l_recs[0]))) {
3869                 el->l_recs[0] = *insert_rec;
3870                 el->l_next_free_rec = cpu_to_le16(1);
3871                 return;
3872         }
3873
3874         /*
3875          * Appending insert.
3876          */
3877         if (insert->ins_appending == APPEND_TAIL) {
3878                 i = le16_to_cpu(el->l_next_free_rec) - 1;
3879                 rec = &el->l_recs[i];
3880                 range = le32_to_cpu(rec->e_cpos)
3881                         + le16_to_cpu(rec->e_leaf_clusters);
3882                 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3883
3884                 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3885                                 le16_to_cpu(el->l_count),
3886                                 "owner %llu, depth %u, count %u, next free %u, "
3887                                 "rec.cpos %u, rec.clusters %u, "
3888                                 "insert.cpos %u, insert.clusters %u\n",
3889                                 ocfs2_metadata_cache_owner(et->et_ci),
3890                                 le16_to_cpu(el->l_tree_depth),
3891                                 le16_to_cpu(el->l_count),
3892                                 le16_to_cpu(el->l_next_free_rec),
3893                                 le32_to_cpu(el->l_recs[i].e_cpos),
3894                                 le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3895                                 le32_to_cpu(insert_rec->e_cpos),
3896                                 le16_to_cpu(insert_rec->e_leaf_clusters));
3897                 i++;
3898                 el->l_recs[i] = *insert_rec;
3899                 le16_add_cpu(&el->l_next_free_rec, 1);
3900                 return;
3901         }
3902
3903 rotate:
3904         /*
3905          * Ok, we have to rotate.
3906          *
3907          * At this point, it is safe to assume that inserting into an
3908          * empty leaf and appending to a leaf have both been handled
3909          * above.
3910          *
3911          * This leaf needs to have space, either by the empty 1st
3912          * extent record, or by virtue of an l_next_rec < l_count.
3913          */
3914         ocfs2_rotate_leaf(el, insert_rec);
3915 }
3916
3917 static void ocfs2_adjust_rightmost_records(handle_t *handle,
3918                                            struct ocfs2_extent_tree *et,
3919                                            struct ocfs2_path *path,
3920                                            struct ocfs2_extent_rec *insert_rec)
3921 {
3922         int ret, i, next_free;
3923         struct buffer_head *bh;
3924         struct ocfs2_extent_list *el;
3925         struct ocfs2_extent_rec *rec;
3926
3927         /*
3928          * Update everything except the leaf block.
3929          */
3930         for (i = 0; i < path->p_tree_depth; i++) {
3931                 bh = path->p_node[i].bh;
3932                 el = path->p_node[i].el;
3933
3934                 next_free = le16_to_cpu(el->l_next_free_rec);
3935                 if (next_free == 0) {
3936                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3937                                     "Owner %llu has a bad extent list",
3938                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3939                         ret = -EIO;
3940                         return;
3941                 }
3942
3943                 rec = &el->l_recs[next_free - 1];
3944
3945                 rec->e_int_clusters = insert_rec->e_cpos;
3946                 le32_add_cpu(&rec->e_int_clusters,
3947                              le16_to_cpu(insert_rec->e_leaf_clusters));
3948                 le32_add_cpu(&rec->e_int_clusters,
3949                              -le32_to_cpu(rec->e_cpos));
3950
3951                 ocfs2_journal_dirty(handle, bh);
3952         }
3953 }
3954
3955 static int ocfs2_append_rec_to_path(handle_t *handle,
3956                                     struct ocfs2_extent_tree *et,
3957                                     struct ocfs2_extent_rec *insert_rec,
3958                                     struct ocfs2_path *right_path,
3959                                     struct ocfs2_path **ret_left_path)
3960 {
3961         int ret, next_free;
3962         struct ocfs2_extent_list *el;
3963         struct ocfs2_path *left_path = NULL;
3964
3965         *ret_left_path = NULL;
3966
3967         /*
3968          * This shouldn't happen for non-trees. The extent rec cluster
3969          * count manipulation below only works for interior nodes.
3970          */
3971         BUG_ON(right_path->p_tree_depth == 0);
3972
3973         /*
3974          * If our appending insert is at the leftmost edge of a leaf,
3975          * then we might need to update the rightmost records of the
3976          * neighboring path.
3977          */
3978         el = path_leaf_el(right_path);
3979         next_free = le16_to_cpu(el->l_next_free_rec);
3980         if (next_free == 0 ||
3981             (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3982                 u32 left_cpos;
3983
3984                 ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3985                                                     right_path, &left_cpos);
3986                 if (ret) {
3987                         mlog_errno(ret);
3988                         goto out;
3989                 }
3990
3991                 mlog(0, "Append may need a left path update. cpos: %u, "
3992                      "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3993                      left_cpos);
3994
3995                 /*
3996                  * No need to worry if the append is already in the
3997                  * leftmost leaf.
3998                  */
3999                 if (left_cpos) {
4000                         left_path = ocfs2_new_path_from_path(right_path);
4001                         if (!left_path) {
4002                                 ret = -ENOMEM;
4003                                 mlog_errno(ret);
4004                                 goto out;
4005                         }
4006
4007                         ret = ocfs2_find_path(et->et_ci, left_path,
4008                                               left_cpos);
4009                         if (ret) {
4010                                 mlog_errno(ret);
4011                                 goto out;
4012                         }
4013
4014                         /*
4015                          * ocfs2_insert_path() will pass the left_path to the
4016                          * journal for us.
4017                          */
4018                 }
4019         }
4020
4021         ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4022         if (ret) {
4023                 mlog_errno(ret);
4024                 goto out;
4025         }
4026
4027         ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4028
4029         *ret_left_path = left_path;
4030         ret = 0;
4031 out:
4032         if (ret != 0)
4033                 ocfs2_free_path(left_path);
4034
4035         return ret;
4036 }
4037
4038 static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4039                                struct ocfs2_path *left_path,
4040                                struct ocfs2_path *right_path,
4041                                struct ocfs2_extent_rec *split_rec,
4042                                enum ocfs2_split_type split)
4043 {
4044         int index;
4045         u32 cpos = le32_to_cpu(split_rec->e_cpos);
4046         struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4047         struct ocfs2_extent_rec *rec, *tmprec;
4048
4049         right_el = path_leaf_el(right_path);
4050         if (left_path)
4051                 left_el = path_leaf_el(left_path);
4052
4053         el = right_el;
4054         insert_el = right_el;
4055         index = ocfs2_search_extent_list(el, cpos);
4056         if (index != -1) {
4057                 if (index == 0 && left_path) {
4058                         BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4059
4060                         /*
4061                          * This typically means that the record
4062                          * started in the left path but moved to the
4063                          * right as a result of rotation. We either
4064                          * move the existing record to the left, or we
4065                          * do the later insert there.
4066                          *
4067                          * In this case, the left path should always
4068                          * exist as the rotate code will have passed
4069                          * it back for a post-insert update.
4070                          */
4071
4072                         if (split == SPLIT_LEFT) {
4073                                 /*
4074                                  * It's a left split. Since we know
4075                                  * that the rotate code gave us an
4076                                  * empty extent in the left path, we
4077                                  * can just do the insert there.
4078                                  */
4079                                 insert_el = left_el;
4080                         } else {
4081                                 /*
4082                                  * Right split - we have to move the
4083                                  * existing record over to the left
4084                                  * leaf. The insert will be into the
4085                                  * newly created empty extent in the
4086                                  * right leaf.
4087                                  */
4088                                 tmprec = &right_el->l_recs[index];
4089                                 ocfs2_rotate_leaf(left_el, tmprec);
4090                                 el = left_el;
4091
4092                                 memset(tmprec, 0, sizeof(*tmprec));
4093                                 index = ocfs2_search_extent_list(left_el, cpos);
4094                                 BUG_ON(index == -1);
4095                         }
4096                 }
4097         } else {
4098                 BUG_ON(!left_path);
4099                 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4100                 /*
4101                  * Left path is easy - we can just allow the insert to
4102                  * happen.
4103                  */
4104                 el = left_el;
4105                 insert_el = left_el;
4106                 index = ocfs2_search_extent_list(el, cpos);
4107                 BUG_ON(index == -1);
4108         }
4109
4110         rec = &el->l_recs[index];
4111         ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4112                                 split, rec, split_rec);
4113         ocfs2_rotate_leaf(insert_el, split_rec);
4114 }
4115
4116 /*
4117  * This function only does inserts on an allocation b-tree. For tree
4118  * depth = 0, ocfs2_insert_at_leaf() is called directly.
4119  *
4120  * right_path is the path we want to do the actual insert
4121  * in. left_path should only be passed in if we need to update that
4122  * portion of the tree after an edge insert.
4123  */
4124 static int ocfs2_insert_path(handle_t *handle,
4125                              struct ocfs2_extent_tree *et,
4126                              struct ocfs2_path *left_path,
4127                              struct ocfs2_path *right_path,
4128                              struct ocfs2_extent_rec *insert_rec,
4129                              struct ocfs2_insert_type *insert)
4130 {
4131         int ret, subtree_index;
4132         struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4133
4134         if (left_path) {
4135                 /*
4136                  * There's a chance that left_path got passed back to
4137                  * us without being accounted for in the
4138                  * journal. Extend our transaction here to be sure we
4139                  * can change those blocks.
4140                  */
4141                 ret = ocfs2_extend_trans(handle, left_path->p_tree_depth);
4142                 if (ret < 0) {
4143                         mlog_errno(ret);
4144                         goto out;
4145                 }
4146
4147                 ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4148                 if (ret < 0) {
4149                         mlog_errno(ret);
4150                         goto out;
4151                 }
4152         }
4153
4154         /*
4155          * Pass both paths to the journal. The majority of inserts
4156          * will be touching all components anyway.
4157          */
4158         ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4159         if (ret < 0) {
4160                 mlog_errno(ret);
4161                 goto out;
4162         }
4163
4164         if (insert->ins_split != SPLIT_NONE) {
4165                 /*
4166                  * We could call ocfs2_insert_at_leaf() for some types
4167                  * of splits, but it's easier to just let one separate
4168                  * function sort it all out.
4169                  */
4170                 ocfs2_split_record(et, left_path, right_path,
4171                                    insert_rec, insert->ins_split);
4172
4173                 /*
4174                  * Split might have modified either leaf and we don't
4175                  * have a guarantee that the later edge insert will
4176                  * dirty this for us.
4177                  */
4178                 if (left_path)
4179                         ocfs2_journal_dirty(handle,
4180                                             path_leaf_bh(left_path));
4181         } else
4182                 ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4183                                      insert);
4184
4185         ocfs2_journal_dirty(handle, leaf_bh);
4186
4187         if (left_path) {
4188                 /*
4189                  * The rotate code has indicated that we need to fix
4190                  * up portions of the tree after the insert.
4191                  *
4192                  * XXX: Should we extend the transaction here?
4193                  */
4194                 subtree_index = ocfs2_find_subtree_root(et, left_path,
4195                                                         right_path);
4196                 ocfs2_complete_edge_insert(handle, left_path, right_path,
4197                                            subtree_index);
4198         }
4199
4200         ret = 0;
4201 out:
4202         return ret;
4203 }
4204
4205 static int ocfs2_do_insert_extent(handle_t *handle,
4206                                   struct ocfs2_extent_tree *et,
4207                                   struct ocfs2_extent_rec *insert_rec,
4208                                   struct ocfs2_insert_type *type)
4209 {
4210         int ret, rotate = 0;
4211         u32 cpos;
4212         struct ocfs2_path *right_path = NULL;
4213         struct ocfs2_path *left_path = NULL;
4214         struct ocfs2_extent_list *el;
4215
4216         el = et->et_root_el;
4217
4218         ret = ocfs2_et_root_journal_access(handle, et,
4219                                            OCFS2_JOURNAL_ACCESS_WRITE);
4220         if (ret) {
4221                 mlog_errno(ret);
4222                 goto out;
4223         }
4224
4225         if (le16_to_cpu(el->l_tree_depth) == 0) {
4226                 ocfs2_insert_at_leaf(et, insert_rec, el, type);
4227                 goto out_update_clusters;
4228         }
4229
4230         right_path = ocfs2_new_path_from_et(et);
4231         if (!right_path) {
4232                 ret = -ENOMEM;
4233                 mlog_errno(ret);
4234                 goto out;
4235         }
4236
4237         /*
4238          * Determine the path to start with. Rotations need the
4239          * rightmost path, everything else can go directly to the
4240          * target leaf.
4241          */
4242         cpos = le32_to_cpu(insert_rec->e_cpos);
4243         if (type->ins_appending == APPEND_NONE &&
4244             type->ins_contig == CONTIG_NONE) {
4245                 rotate = 1;
4246                 cpos = UINT_MAX;
4247         }
4248
4249         ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4250         if (ret) {
4251                 mlog_errno(ret);
4252                 goto out;
4253         }
4254
4255         /*
4256          * Rotations and appends need special treatment - they modify
4257          * parts of the tree's above them.
4258          *
4259          * Both might pass back a path immediate to the left of the
4260          * one being inserted to. This will be cause
4261          * ocfs2_insert_path() to modify the rightmost records of
4262          * left_path to account for an edge insert.
4263          *
4264          * XXX: When modifying this code, keep in mind that an insert
4265          * can wind up skipping both of these two special cases...
4266          */
4267         if (rotate) {
4268                 ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4269                                               le32_to_cpu(insert_rec->e_cpos),
4270                                               right_path, &left_path);
4271                 if (ret) {
4272                         mlog_errno(ret);
4273                         goto out;
4274                 }
4275
4276                 /*
4277                  * ocfs2_rotate_tree_right() might have extended the
4278                  * transaction without re-journaling our tree root.
4279                  */
4280                 ret = ocfs2_et_root_journal_access(handle, et,
4281                                                    OCFS2_JOURNAL_ACCESS_WRITE);
4282                 if (ret) {
4283                         mlog_errno(ret);
4284                         goto out;
4285                 }
4286         } else if (type->ins_appending == APPEND_TAIL
4287                    && type->ins_contig != CONTIG_LEFT) {
4288                 ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4289                                                right_path, &left_path);
4290                 if (ret) {
4291                         mlog_errno(ret);
4292                         goto out;
4293                 }
4294         }
4295
4296         ret = ocfs2_insert_path(handle, et, left_path, right_path,
4297                                 insert_rec, type);
4298         if (ret) {
4299                 mlog_errno(ret);
4300                 goto out;
4301         }
4302
4303 out_update_clusters:
4304         if (type->ins_split == SPLIT_NONE)
4305                 ocfs2_et_update_clusters(et,
4306                                          le16_to_cpu(insert_rec->e_leaf_clusters));
4307
4308         ocfs2_journal_dirty(handle, et->et_root_bh);
4309
4310 out:
4311         ocfs2_free_path(left_path);
4312         ocfs2_free_path(right_path);
4313
4314         return ret;
4315 }
4316
4317 static enum ocfs2_contig_type
4318 ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4319                                struct ocfs2_path *path,
4320                                struct ocfs2_extent_list *el, int index,
4321                                struct ocfs2_extent_rec *split_rec)
4322 {
4323         int status;
4324         enum ocfs2_contig_type ret = CONTIG_NONE;
4325         u32 left_cpos, right_cpos;
4326         struct ocfs2_extent_rec *rec = NULL;
4327         struct ocfs2_extent_list *new_el;
4328         struct ocfs2_path *left_path = NULL, *right_path = NULL;
4329         struct buffer_head *bh;
4330         struct ocfs2_extent_block *eb;
4331         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4332
4333         if (index > 0) {
4334                 rec = &el->l_recs[index - 1];
4335         } else if (path->p_tree_depth > 0) {
4336                 status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4337                 if (status)
4338                         goto out;
4339
4340                 if (left_cpos != 0) {
4341                         left_path = ocfs2_new_path_from_path(path);
4342                         if (!left_path)
4343                                 goto out;
4344
4345                         status = ocfs2_find_path(et->et_ci, left_path,
4346                                                  left_cpos);
4347                         if (status)
4348                                 goto out;
4349
4350                         new_el = path_leaf_el(left_path);
4351
4352                         if (le16_to_cpu(new_el->l_next_free_rec) !=
4353                             le16_to_cpu(new_el->l_count)) {
4354                                 bh = path_leaf_bh(left_path);
4355                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4356                                 ocfs2_error(sb,
4357                                             "Extent block #%llu has an "
4358                                             "invalid l_next_free_rec of "
4359                                             "%d.  It should have "
4360                                             "matched the l_count of %d",
4361                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4362                                             le16_to_cpu(new_el->l_next_free_rec),
4363                                             le16_to_cpu(new_el->l_count));
4364                                 status = -EINVAL;
4365                                 goto out;
4366                         }
4367                         rec = &new_el->l_recs[
4368                                 le16_to_cpu(new_el->l_next_free_rec) - 1];
4369                 }
4370         }
4371
4372         /*
4373          * We're careful to check for an empty extent record here -
4374          * the merge code will know what to do if it sees one.
4375          */
4376         if (rec) {
4377                 if (index == 1 && ocfs2_is_empty_extent(rec)) {
4378                         if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4379                                 ret = CONTIG_RIGHT;
4380                 } else {
4381                         ret = ocfs2_et_extent_contig(et, rec, split_rec);
4382                 }
4383         }
4384
4385         rec = NULL;
4386         if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4387                 rec = &el->l_recs[index + 1];
4388         else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4389                  path->p_tree_depth > 0) {
4390                 status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4391                 if (status)
4392                         goto out;
4393
4394                 if (right_cpos == 0)
4395                         goto out;
4396
4397                 right_path = ocfs2_new_path_from_path(path);
4398                 if (!right_path)
4399                         goto out;
4400
4401                 status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4402                 if (status)
4403                         goto out;
4404
4405                 new_el = path_leaf_el(right_path);
4406                 rec = &new_el->l_recs[0];
4407                 if (ocfs2_is_empty_extent(rec)) {
4408                         if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4409                                 bh = path_leaf_bh(right_path);
4410                                 eb = (struct ocfs2_extent_block *)bh->b_data;
4411                                 ocfs2_error(sb,
4412                                             "Extent block #%llu has an "
4413                                             "invalid l_next_free_rec of %d",
4414                                             (unsigned long long)le64_to_cpu(eb->h_blkno),
4415                                             le16_to_cpu(new_el->l_next_free_rec));
4416                                 status = -EINVAL;
4417                                 goto out;
4418                         }
4419                         rec = &new_el->l_recs[1];
4420                 }
4421         }
4422
4423         if (rec) {
4424                 enum ocfs2_contig_type contig_type;
4425
4426                 contig_type = ocfs2_et_extent_contig(et, rec, split_rec);
4427
4428                 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4429                         ret = CONTIG_LEFTRIGHT;
4430                 else if (ret == CONTIG_NONE)
4431                         ret = contig_type;
4432         }
4433
4434 out:
4435         if (left_path)
4436                 ocfs2_free_path(left_path);
4437         if (right_path)
4438                 ocfs2_free_path(right_path);
4439
4440         return ret;
4441 }
4442
4443 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4444                                      struct ocfs2_insert_type *insert,
4445                                      struct ocfs2_extent_list *el,
4446                                      struct ocfs2_extent_rec *insert_rec)
4447 {
4448         int i;
4449         enum ocfs2_contig_type contig_type = CONTIG_NONE;
4450
4451         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4452
4453         for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4454                 contig_type = ocfs2_et_extent_contig(et, &el->l_recs[i],
4455                                                      insert_rec);
4456                 if (contig_type != CONTIG_NONE) {
4457                         insert->ins_contig_index = i;
4458                         break;
4459                 }
4460         }
4461         insert->ins_contig = contig_type;
4462
4463         if (insert->ins_contig != CONTIG_NONE) {
4464                 struct ocfs2_extent_rec *rec =
4465                                 &el->l_recs[insert->ins_contig_index];
4466                 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4467                                    le16_to_cpu(insert_rec->e_leaf_clusters);
4468
4469                 /*
4470                  * Caller might want us to limit the size of extents, don't
4471                  * calculate contiguousness if we might exceed that limit.
4472                  */
4473                 if (et->et_max_leaf_clusters &&
4474                     (len > et->et_max_leaf_clusters))
4475                         insert->ins_contig = CONTIG_NONE;
4476         }
4477 }
4478
4479 /*
4480  * This should only be called against the righmost leaf extent list.
4481  *
4482  * ocfs2_figure_appending_type() will figure out whether we'll have to
4483  * insert at the tail of the rightmost leaf.
4484  *
4485  * This should also work against the root extent list for tree's with 0
4486  * depth. If we consider the root extent list to be the rightmost leaf node
4487  * then the logic here makes sense.
4488  */
4489 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4490                                         struct ocfs2_extent_list *el,
4491                                         struct ocfs2_extent_rec *insert_rec)
4492 {
4493         int i;
4494         u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4495         struct ocfs2_extent_rec *rec;
4496
4497         insert->ins_appending = APPEND_NONE;
4498
4499         BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4500
4501         if (!el->l_next_free_rec)
4502                 goto set_tail_append;
4503
4504         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4505                 /* Were all records empty? */
4506                 if (le16_to_cpu(el->l_next_free_rec) == 1)
4507                         goto set_tail_append;
4508         }
4509
4510         i = le16_to_cpu(el->l_next_free_rec) - 1;
4511         rec = &el->l_recs[i];
4512
4513         if (cpos >=
4514             (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4515                 goto set_tail_append;
4516
4517         return;
4518
4519 set_tail_append:
4520         insert->ins_appending = APPEND_TAIL;
4521 }
4522
4523 /*
4524  * Helper function called at the begining of an insert.
4525  *
4526  * This computes a few things that are commonly used in the process of
4527  * inserting into the btree:
4528  *   - Whether the new extent is contiguous with an existing one.
4529  *   - The current tree depth.
4530  *   - Whether the insert is an appending one.
4531  *   - The total # of free records in the tree.
4532  *
4533  * All of the information is stored on the ocfs2_insert_type
4534  * structure.
4535  */
4536 static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4537                                     struct buffer_head **last_eb_bh,
4538                                     struct ocfs2_extent_rec *insert_rec,
4539                                     int *free_records,
4540                                     struct ocfs2_insert_type *insert)
4541 {
4542         int ret;
4543         struct ocfs2_extent_block *eb;
4544         struct ocfs2_extent_list *el;
4545         struct ocfs2_path *path = NULL;
4546         struct buffer_head *bh = NULL;
4547
4548         insert->ins_split = SPLIT_NONE;
4549
4550         el = et->et_root_el;
4551         insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4552
4553         if (el->l_tree_depth) {
4554                 /*
4555                  * If we have tree depth, we read in the
4556                  * rightmost extent block ahead of time as
4557                  * ocfs2_figure_insert_type() and ocfs2_add_branch()
4558                  * may want it later.
4559                  */
4560                 ret = ocfs2_read_extent_block(et->et_ci,
4561                                               ocfs2_et_get_last_eb_blk(et),
4562                                               &bh);
4563                 if (ret) {
4564                         mlog_exit(ret);
4565                         goto out;
4566                 }
4567                 eb = (struct ocfs2_extent_block *) bh->b_data;
4568                 el = &eb->h_list;
4569         }
4570
4571         /*
4572          * Unless we have a contiguous insert, we'll need to know if
4573          * there is room left in our allocation tree for another
4574          * extent record.
4575          *
4576          * XXX: This test is simplistic, we can search for empty
4577          * extent records too.
4578          */
4579         *free_records = le16_to_cpu(el->l_count) -
4580                 le16_to_cpu(el->l_next_free_rec);
4581
4582         if (!insert->ins_tree_depth) {
4583                 ocfs2_figure_contig_type(et, insert, el, insert_rec);
4584                 ocfs2_figure_appending_type(insert, el, insert_rec);
4585                 return 0;
4586         }
4587
4588         path = ocfs2_new_path_from_et(et);
4589         if (!path) {
4590                 ret = -ENOMEM;
4591                 mlog_errno(ret);
4592                 goto out;
4593         }
4594
4595         /*
4596          * In the case that we're inserting past what the tree
4597          * currently accounts for, ocfs2_find_path() will return for
4598          * us the rightmost tree path. This is accounted for below in
4599          * the appending code.
4600          */
4601         ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4602         if (ret) {
4603                 mlog_errno(ret);
4604                 goto out;
4605         }
4606
4607         el = path_leaf_el(path);
4608
4609         /*
4610          * Now that we have the path, there's two things we want to determine:
4611          * 1) Contiguousness (also set contig_index if this is so)
4612          *
4613          * 2) Are we doing an append? We can trivially break this up
4614          *     into two types of appends: simple record append, or a
4615          *     rotate inside the tail leaf.
4616          */
4617         ocfs2_figure_contig_type(et, insert, el, insert_rec);
4618
4619         /*
4620          * The insert code isn't quite ready to deal with all cases of
4621          * left contiguousness. Specifically, if it's an insert into
4622          * the 1st record in a leaf, it will require the adjustment of
4623          * cluster count on the last record of the path directly to it's
4624          * left. For now, just catch that case and fool the layers
4625          * above us. This works just fine for tree_depth == 0, which
4626          * is why we allow that above.
4627          */
4628         if (insert->ins_contig == CONTIG_LEFT &&
4629             insert->ins_contig_index == 0)
4630                 insert->ins_contig = CONTIG_NONE;
4631
4632         /*
4633          * Ok, so we can simply compare against last_eb to figure out
4634          * whether the path doesn't exist. This will only happen in
4635          * the case that we're doing a tail append, so maybe we can
4636          * take advantage of that information somehow.
4637          */
4638         if (ocfs2_et_get_last_eb_blk(et) ==
4639             path_leaf_bh(path)->b_blocknr) {
4640                 /*
4641                  * Ok, ocfs2_find_path() returned us the rightmost
4642                  * tree path. This might be an appending insert. There are
4643                  * two cases:
4644                  *    1) We're doing a true append at the tail:
4645                  *      -This might even be off the end of the leaf
4646                  *    2) We're "appending" by rotating in the tail
4647                  */
4648                 ocfs2_figure_appending_type(insert, el, insert_rec);
4649         }
4650
4651 out:
4652         ocfs2_free_path(path);
4653
4654         if (ret == 0)
4655                 *last_eb_bh = bh;
4656         else
4657                 brelse(bh);
4658         return ret;
4659 }
4660
4661 /*
4662  * Insert an extent into a btree.
4663  *
4664  * The caller needs to update the owning btree's cluster count.
4665  */
4666 int ocfs2_insert_extent(handle_t *handle,
4667                         struct ocfs2_extent_tree *et,
4668                         u32 cpos,
4669                         u64 start_blk,
4670                         u32 new_clusters,
4671                         u8 flags,
4672                         struct ocfs2_alloc_context *meta_ac)
4673 {
4674         int status;
4675         int uninitialized_var(free_records);
4676         struct buffer_head *last_eb_bh = NULL;
4677         struct ocfs2_insert_type insert = {0, };
4678         struct ocfs2_extent_rec rec;
4679
4680         mlog(0, "add %u clusters at position %u to owner %llu\n",
4681              new_clusters, cpos,
4682              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4683
4684         memset(&rec, 0, sizeof(rec));
4685         rec.e_cpos = cpu_to_le32(cpos);
4686         rec.e_blkno = cpu_to_le64(start_blk);
4687         rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4688         rec.e_flags = flags;
4689         status = ocfs2_et_insert_check(et, &rec);
4690         if (status) {
4691                 mlog_errno(status);
4692                 goto bail;
4693         }
4694
4695         status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4696                                           &free_records, &insert);
4697         if (status < 0) {
4698                 mlog_errno(status);
4699                 goto bail;
4700         }
4701
4702         mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4703              "Insert.contig_index: %d, Insert.free_records: %d, "
4704              "Insert.tree_depth: %d\n",
4705              insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4706              free_records, insert.ins_tree_depth);
4707
4708         if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4709                 status = ocfs2_grow_tree(handle, et,
4710                                          &insert.ins_tree_depth, &last_eb_bh,
4711                                          meta_ac);
4712                 if (status) {
4713                         mlog_errno(status);
4714                         goto bail;
4715                 }
4716         }
4717
4718         /* Finally, we can add clusters. This might rotate the tree for us. */
4719         status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4720         if (status < 0)
4721                 mlog_errno(status);
4722         else
4723                 ocfs2_et_extent_map_insert(et, &rec);
4724
4725 bail:
4726         brelse(last_eb_bh);
4727
4728         mlog_exit(status);
4729         return status;
4730 }
4731
4732 /*
4733  * Allcate and add clusters into the extent b-tree.
4734  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4735  * The extent b-tree's root is specified by et, and
4736  * it is not limited to the file storage. Any extent tree can use this
4737  * function if it implements the proper ocfs2_extent_tree.
4738  */
4739 int ocfs2_add_clusters_in_btree(handle_t *handle,
4740                                 struct ocfs2_extent_tree *et,
4741                                 u32 *logical_offset,
4742                                 u32 clusters_to_add,
4743                                 int mark_unwritten,
4744                                 struct ocfs2_alloc_context *data_ac,
4745                                 struct ocfs2_alloc_context *meta_ac,
4746                                 enum ocfs2_alloc_restarted *reason_ret)
4747 {
4748         int status = 0;
4749         int free_extents;
4750         enum ocfs2_alloc_restarted reason = RESTART_NONE;
4751         u32 bit_off, num_bits;
4752         u64 block;
4753         u8 flags = 0;
4754         struct ocfs2_super *osb =
4755                 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4756
4757         BUG_ON(!clusters_to_add);
4758
4759         if (mark_unwritten)
4760                 flags = OCFS2_EXT_UNWRITTEN;
4761
4762         free_extents = ocfs2_num_free_extents(osb, et);
4763         if (free_extents < 0) {
4764                 status = free_extents;
4765                 mlog_errno(status);
4766                 goto leave;
4767         }
4768
4769         /* there are two cases which could cause us to EAGAIN in the
4770          * we-need-more-metadata case:
4771          * 1) we haven't reserved *any*
4772          * 2) we are so fragmented, we've needed to add metadata too
4773          *    many times. */
4774         if (!free_extents && !meta_ac) {
4775                 mlog(0, "we haven't reserved any metadata!\n");
4776                 status = -EAGAIN;
4777                 reason = RESTART_META;
4778                 goto leave;
4779         } else if ((!free_extents)
4780                    && (ocfs2_alloc_context_bits_left(meta_ac)
4781                        < ocfs2_extend_meta_needed(et->et_root_el))) {
4782                 mlog(0, "filesystem is really fragmented...\n");
4783                 status = -EAGAIN;
4784                 reason = RESTART_META;
4785                 goto leave;
4786         }
4787
4788         status = __ocfs2_claim_clusters(handle, data_ac, 1,
4789                                         clusters_to_add, &bit_off, &num_bits);
4790         if (status < 0) {
4791                 if (status != -ENOSPC)
4792                         mlog_errno(status);
4793                 goto leave;
4794         }
4795
4796         BUG_ON(num_bits > clusters_to_add);
4797
4798         /* reserve our write early -- insert_extent may update the tree root */
4799         status = ocfs2_et_root_journal_access(handle, et,
4800                                               OCFS2_JOURNAL_ACCESS_WRITE);
4801         if (status < 0) {
4802                 mlog_errno(status);
4803                 goto leave;
4804         }
4805
4806         block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4807         mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
4808              num_bits, bit_off,
4809              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4810         status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4811                                      num_bits, flags, meta_ac);
4812         if (status < 0) {
4813                 mlog_errno(status);
4814                 goto leave;
4815         }
4816
4817         ocfs2_journal_dirty(handle, et->et_root_bh);
4818
4819         clusters_to_add -= num_bits;
4820         *logical_offset += num_bits;
4821
4822         if (clusters_to_add) {
4823                 mlog(0, "need to alloc once more, wanted = %u\n",
4824                      clusters_to_add);
4825                 status = -EAGAIN;
4826                 reason = RESTART_TRANS;
4827         }
4828
4829 leave:
4830         mlog_exit(status);
4831         if (reason_ret)
4832                 *reason_ret = reason;
4833         return status;
4834 }
4835
4836 static void ocfs2_make_right_split_rec(struct super_block *sb,
4837                                        struct ocfs2_extent_rec *split_rec,
4838                                        u32 cpos,
4839                                        struct ocfs2_extent_rec *rec)
4840 {
4841         u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4842         u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4843
4844         memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4845
4846         split_rec->e_cpos = cpu_to_le32(cpos);
4847         split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4848
4849         split_rec->e_blkno = rec->e_blkno;
4850         le64_add_cpu(&split_rec->e_blkno,
4851                      ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4852
4853         split_rec->e_flags = rec->e_flags;
4854 }
4855
4856 static int ocfs2_split_and_insert(handle_t *handle,
4857                                   struct ocfs2_extent_tree *et,
4858                                   struct ocfs2_path *path,
4859                                   struct buffer_head **last_eb_bh,
4860                                   int split_index,
4861                                   struct ocfs2_extent_rec *orig_split_rec,
4862                                   struct ocfs2_alloc_context *meta_ac)
4863 {
4864         int ret = 0, depth;
4865         unsigned int insert_range, rec_range, do_leftright = 0;
4866         struct ocfs2_extent_rec tmprec;
4867         struct ocfs2_extent_list *rightmost_el;
4868         struct ocfs2_extent_rec rec;
4869         struct ocfs2_extent_rec split_rec = *orig_split_rec;
4870         struct ocfs2_insert_type insert;
4871         struct ocfs2_extent_block *eb;
4872
4873 leftright:
4874         /*
4875          * Store a copy of the record on the stack - it might move
4876          * around as the tree is manipulated below.
4877          */
4878         rec = path_leaf_el(path)->l_recs[split_index];
4879
4880         rightmost_el = et->et_root_el;
4881
4882         depth = le16_to_cpu(rightmost_el->l_tree_depth);
4883         if (depth) {
4884                 BUG_ON(!(*last_eb_bh));
4885                 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4886                 rightmost_el = &eb->h_list;
4887         }
4888
4889         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4890             le16_to_cpu(rightmost_el->l_count)) {
4891                 ret = ocfs2_grow_tree(handle, et,
4892                                       &depth, last_eb_bh, meta_ac);
4893                 if (ret) {
4894                         mlog_errno(ret);
4895                         goto out;
4896                 }
4897         }
4898
4899         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4900         insert.ins_appending = APPEND_NONE;
4901         insert.ins_contig = CONTIG_NONE;
4902         insert.ins_tree_depth = depth;
4903
4904         insert_range = le32_to_cpu(split_rec.e_cpos) +
4905                 le16_to_cpu(split_rec.e_leaf_clusters);
4906         rec_range = le32_to_cpu(rec.e_cpos) +
4907                 le16_to_cpu(rec.e_leaf_clusters);
4908
4909         if (split_rec.e_cpos == rec.e_cpos) {
4910                 insert.ins_split = SPLIT_LEFT;
4911         } else if (insert_range == rec_range) {
4912                 insert.ins_split = SPLIT_RIGHT;
4913         } else {
4914                 /*
4915                  * Left/right split. We fake this as a right split
4916                  * first and then make a second pass as a left split.
4917                  */
4918                 insert.ins_split = SPLIT_RIGHT;
4919
4920                 ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4921                                            &tmprec, insert_range, &rec);
4922
4923                 split_rec = tmprec;
4924
4925                 BUG_ON(do_leftright);
4926                 do_leftright = 1;
4927         }
4928
4929         ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4930         if (ret) {
4931                 mlog_errno(ret);
4932                 goto out;
4933         }
4934
4935         if (do_leftright == 1) {
4936                 u32 cpos;
4937                 struct ocfs2_extent_list *el;
4938
4939                 do_leftright++;
4940                 split_rec = *orig_split_rec;
4941
4942                 ocfs2_reinit_path(path, 1);
4943
4944                 cpos = le32_to_cpu(split_rec.e_cpos);
4945                 ret = ocfs2_find_path(et->et_ci, path, cpos);
4946                 if (ret) {
4947                         mlog_errno(ret);
4948                         goto out;
4949                 }
4950
4951                 el = path_leaf_el(path);
4952                 split_index = ocfs2_search_extent_list(el, cpos);
4953                 goto leftright;
4954         }
4955 out:
4956
4957         return ret;
4958 }
4959
4960 static int ocfs2_replace_extent_rec(handle_t *handle,
4961                                     struct ocfs2_extent_tree *et,
4962                                     struct ocfs2_path *path,
4963                                     struct ocfs2_extent_list *el,
4964                                     int split_index,
4965                                     struct ocfs2_extent_rec *split_rec)
4966 {
4967         int ret;
4968
4969         ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4970                                            path_num_items(path) - 1);
4971         if (ret) {
4972                 mlog_errno(ret);
4973                 goto out;
4974         }
4975
4976         el->l_recs[split_index] = *split_rec;
4977
4978         ocfs2_journal_dirty(handle, path_leaf_bh(path));
4979 out:
4980         return ret;
4981 }
4982
4983 /*
4984  * Split part or all of the extent record at split_index in the leaf
4985  * pointed to by path. Merge with the contiguous extent record if needed.
4986  *
4987  * Care is taken to handle contiguousness so as to not grow the tree.
4988  *
4989  * meta_ac is not strictly necessary - we only truly need it if growth
4990  * of the tree is required. All other cases will degrade into a less
4991  * optimal tree layout.
4992  *
4993  * last_eb_bh should be the rightmost leaf block for any extent
4994  * btree. Since a split may grow the tree or a merge might shrink it,
4995  * the caller cannot trust the contents of that buffer after this call.
4996  *
4997  * This code is optimized for readability - several passes might be
4998  * made over certain portions of the tree. All of those blocks will
4999  * have been brought into cache (and pinned via the journal), so the
5000  * extra overhead is not expressed in terms of disk reads.
5001  */
5002 int ocfs2_split_extent(handle_t *handle,
5003                        struct ocfs2_extent_tree *et,
5004                        struct ocfs2_path *path,
5005                        int split_index,
5006                        struct ocfs2_extent_rec *split_rec,
5007                        struct ocfs2_alloc_context *meta_ac,
5008                        struct ocfs2_cached_dealloc_ctxt *dealloc)
5009 {
5010         int ret = 0;
5011         struct ocfs2_extent_list *el = path_leaf_el(path);
5012         struct buffer_head *last_eb_bh = NULL;
5013         struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5014         struct ocfs2_merge_ctxt ctxt;
5015         struct ocfs2_extent_list *rightmost_el;
5016
5017         if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5018             ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5019              (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5020                 ret = -EIO;
5021                 mlog_errno(ret);
5022                 goto out;
5023         }
5024
5025         ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5026                                                             split_index,
5027                                                             split_rec);
5028
5029         /*
5030          * The core merge / split code wants to know how much room is
5031          * left in this allocation tree, so we pass the
5032          * rightmost extent list.
5033          */
5034         if (path->p_tree_depth) {
5035                 struct ocfs2_extent_block *eb;
5036
5037                 ret = ocfs2_read_extent_block(et->et_ci,
5038                                               ocfs2_et_get_last_eb_blk(et),
5039                                               &last_eb_bh);
5040                 if (ret) {
5041                         mlog_exit(ret);
5042                         goto out;
5043                 }
5044
5045                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5046                 rightmost_el = &eb->h_list;
5047         } else
5048                 rightmost_el = path_root_el(path);
5049
5050         if (rec->e_cpos == split_rec->e_cpos &&
5051             rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5052                 ctxt.c_split_covers_rec = 1;
5053         else
5054                 ctxt.c_split_covers_rec = 0;
5055
5056         ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5057
5058         mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
5059              split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
5060              ctxt.c_split_covers_rec);
5061
5062         if (ctxt.c_contig_type == CONTIG_NONE) {
5063                 if (ctxt.c_split_covers_rec)
5064                         ret = ocfs2_replace_extent_rec(handle, et, path, el,
5065                                                        split_index, split_rec);
5066                 else
5067                         ret = ocfs2_split_and_insert(handle, et, path,
5068                                                      &last_eb_bh, split_index,
5069                                                      split_rec, meta_ac);
5070                 if (ret)
5071                         mlog_errno(ret);
5072         } else {
5073                 ret = ocfs2_try_to_merge_extent(handle, et, path,
5074                                                 split_index, split_rec,
5075                                                 dealloc, &ctxt);
5076                 if (ret)
5077                         mlog_errno(ret);
5078         }
5079
5080 out:
5081         brelse(last_eb_bh);
5082         return ret;
5083 }
5084
5085 /*
5086  * Change the flags of the already-existing extent at cpos for len clusters.
5087  *
5088  * new_flags: the flags we want to set.
5089  * clear_flags: the flags we want to clear.
5090  * phys: the new physical offset we want this new extent starts from.
5091  *
5092  * If the existing extent is larger than the request, initiate a
5093  * split. An attempt will be made at merging with adjacent extents.
5094  *
5095  * The caller is responsible for passing down meta_ac if we'll need it.
5096  */
5097 int ocfs2_change_extent_flag(handle_t *handle,
5098                              struct ocfs2_extent_tree *et,
5099                              u32 cpos, u32 len, u32 phys,
5100                              struct ocfs2_alloc_context *meta_ac,
5101                              struct ocfs2_cached_dealloc_ctxt *dealloc,
5102                              int new_flags, int clear_flags)
5103 {
5104         int ret, index;
5105         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5106         u64 start_blkno = ocfs2_clusters_to_blocks(sb, phys);
5107         struct ocfs2_extent_rec split_rec;
5108         struct ocfs2_path *left_path = NULL;
5109         struct ocfs2_extent_list *el;
5110         struct ocfs2_extent_rec *rec;
5111
5112         left_path = ocfs2_new_path_from_et(et);
5113         if (!left_path) {
5114                 ret = -ENOMEM;
5115                 mlog_errno(ret);
5116                 goto out;
5117         }
5118
5119         ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5120         if (ret) {
5121                 mlog_errno(ret);
5122                 goto out;
5123         }
5124         el = path_leaf_el(left_path);
5125
5126         index = ocfs2_search_extent_list(el, cpos);
5127         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5128                 ocfs2_error(sb,
5129                             "Owner %llu has an extent at cpos %u which can no "
5130                             "longer be found.\n",
5131                              (unsigned long long)
5132                              ocfs2_metadata_cache_owner(et->et_ci), cpos);
5133                 ret = -EROFS;
5134                 goto out;
5135         }
5136
5137         ret = -EIO;
5138         rec = &el->l_recs[index];
5139         if (new_flags && (rec->e_flags & new_flags)) {
5140                 mlog(ML_ERROR, "Owner %llu tried to set %d flags on an "
5141                      "extent that already had them",
5142                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5143                      new_flags);
5144                 goto out;
5145         }
5146
5147         if (clear_flags && !(rec->e_flags & clear_flags)) {
5148                 mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an "
5149                      "extent that didn't have them",
5150                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5151                      clear_flags);
5152                 goto out;
5153         }
5154
5155         memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5156         split_rec.e_cpos = cpu_to_le32(cpos);
5157         split_rec.e_leaf_clusters = cpu_to_le16(len);
5158         split_rec.e_blkno = cpu_to_le64(start_blkno);
5159         split_rec.e_flags = rec->e_flags;
5160         if (new_flags)
5161                 split_rec.e_flags |= new_flags;
5162         if (clear_flags)
5163                 split_rec.e_flags &= ~clear_flags;
5164
5165         ret = ocfs2_split_extent(handle, et, left_path,
5166                                  index, &split_rec, meta_ac,
5167                                  dealloc);
5168         if (ret)
5169                 mlog_errno(ret);
5170
5171 out:
5172         ocfs2_free_path(left_path);
5173         return ret;
5174
5175 }
5176
5177 /*
5178  * Mark the already-existing extent at cpos as written for len clusters.
5179  * This removes the unwritten extent flag.
5180  *
5181  * If the existing extent is larger than the request, initiate a
5182  * split. An attempt will be made at merging with adjacent extents.
5183  *
5184  * The caller is responsible for passing down meta_ac if we'll need it.
5185  */
5186 int ocfs2_mark_extent_written(struct inode *inode,
5187                               struct ocfs2_extent_tree *et,
5188                               handle_t *handle, u32 cpos, u32 len, u32 phys,
5189                               struct ocfs2_alloc_context *meta_ac,
5190                               struct ocfs2_cached_dealloc_ctxt *dealloc)
5191 {
5192         int ret;
5193
5194         mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
5195              inode->i_ino, cpos, len, phys);
5196
5197         if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5198                 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5199                             "that are being written to, but the feature bit "
5200                             "is not set in the super block.",
5201                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
5202                 ret = -EROFS;
5203                 goto out;
5204         }
5205
5206         /*
5207          * XXX: This should be fixed up so that we just re-insert the
5208          * next extent records.
5209          */
5210         ocfs2_et_extent_map_truncate(et, 0);
5211
5212         ret = ocfs2_change_extent_flag(handle, et, cpos,
5213                                        len, phys, meta_ac, dealloc,
5214                                        0, OCFS2_EXT_UNWRITTEN);
5215         if (ret)
5216                 mlog_errno(ret);
5217
5218 out:
5219         return ret;
5220 }
5221
5222 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5223                             struct ocfs2_path *path,
5224                             int index, u32 new_range,
5225                             struct ocfs2_alloc_context *meta_ac)
5226 {
5227         int ret, depth, credits;
5228         struct buffer_head *last_eb_bh = NULL;
5229         struct ocfs2_extent_block *eb;
5230         struct ocfs2_extent_list *rightmost_el, *el;
5231         struct ocfs2_extent_rec split_rec;
5232         struct ocfs2_extent_rec *rec;
5233         struct ocfs2_insert_type insert;
5234
5235         /*
5236          * Setup the record to split before we grow the tree.
5237          */
5238         el = path_leaf_el(path);
5239         rec = &el->l_recs[index];
5240         ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5241                                    &split_rec, new_range, rec);
5242
5243         depth = path->p_tree_depth;
5244         if (depth > 0) {
5245                 ret = ocfs2_read_extent_block(et->et_ci,
5246                                               ocfs2_et_get_last_eb_blk(et),
5247                                               &last_eb_bh);
5248                 if (ret < 0) {
5249                         mlog_errno(ret);
5250                         goto out;
5251                 }
5252
5253                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5254                 rightmost_el = &eb->h_list;
5255         } else
5256                 rightmost_el = path_leaf_el(path);
5257
5258         credits = path->p_tree_depth +
5259                   ocfs2_extend_meta_needed(et->et_root_el);
5260         ret = ocfs2_extend_trans(handle, credits);
5261         if (ret) {
5262                 mlog_errno(ret);
5263                 goto out;
5264         }
5265
5266         if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5267             le16_to_cpu(rightmost_el->l_count)) {
5268                 ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5269                                       meta_ac);
5270                 if (ret) {
5271                         mlog_errno(ret);
5272                         goto out;
5273                 }
5274         }
5275
5276         memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5277         insert.ins_appending = APPEND_NONE;
5278         insert.ins_contig = CONTIG_NONE;
5279         insert.ins_split = SPLIT_RIGHT;
5280         insert.ins_tree_depth = depth;
5281
5282         ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5283         if (ret)
5284                 mlog_errno(ret);
5285
5286 out:
5287         brelse(last_eb_bh);
5288         return ret;
5289 }
5290
5291 static int ocfs2_truncate_rec(handle_t *handle,
5292                               struct ocfs2_extent_tree *et,
5293                               struct ocfs2_path *path, int index,
5294                               struct ocfs2_cached_dealloc_ctxt *dealloc,
5295                               u32 cpos, u32 len)
5296 {
5297         int ret;
5298         u32 left_cpos, rec_range, trunc_range;
5299         int wants_rotate = 0, is_rightmost_tree_rec = 0;
5300         struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5301         struct ocfs2_path *left_path = NULL;
5302         struct ocfs2_extent_list *el = path_leaf_el(path);
5303         struct ocfs2_extent_rec *rec;
5304         struct ocfs2_extent_block *eb;
5305
5306         if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5307                 ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5308                 if (ret) {
5309                         mlog_errno(ret);
5310                         goto out;
5311                 }
5312
5313                 index--;
5314         }
5315
5316         if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5317             path->p_tree_depth) {
5318                 /*
5319                  * Check whether this is the rightmost tree record. If
5320                  * we remove all of this record or part of its right
5321                  * edge then an update of the record lengths above it
5322                  * will be required.
5323                  */
5324                 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5325                 if (eb->h_next_leaf_blk == 0)
5326                         is_rightmost_tree_rec = 1;
5327         }
5328
5329         rec = &el->l_recs[index];
5330         if (index == 0 && path->p_tree_depth &&
5331             le32_to_cpu(rec->e_cpos) == cpos) {
5332                 /*
5333                  * Changing the leftmost offset (via partial or whole
5334                  * record truncate) of an interior (or rightmost) path
5335                  * means we have to update the subtree that is formed
5336                  * by this leaf and the one to it's left.
5337                  *
5338                  * There are two cases we can skip:
5339                  *   1) Path is the leftmost one in our btree.
5340                  *   2) The leaf is rightmost and will be empty after
5341                  *      we remove the extent record - the rotate code
5342                  *      knows how to update the newly formed edge.
5343                  */
5344
5345                 ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5346                 if (ret) {
5347                         mlog_errno(ret);
5348                         goto out;
5349                 }
5350
5351                 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5352                         left_path = ocfs2_new_path_from_path(path);
5353                         if (!left_path) {
5354                                 ret = -ENOMEM;
5355                                 mlog_errno(ret);
5356                                 goto out;
5357                         }
5358
5359                         ret = ocfs2_find_path(et->et_ci, left_path,
5360                                               left_cpos);
5361                         if (ret) {
5362                                 mlog_errno(ret);
5363                                 goto out;
5364                         }
5365                 }
5366         }
5367
5368         ret = ocfs2_extend_rotate_transaction(handle, 0,
5369                                               handle->h_buffer_credits,
5370                                               path);
5371         if (ret) {
5372                 mlog_errno(ret);
5373                 goto out;
5374         }
5375
5376         ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5377         if (ret) {
5378                 mlog_errno(ret);
5379                 goto out;
5380         }
5381
5382         ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5383         if (ret) {
5384                 mlog_errno(ret);
5385                 goto out;
5386         }
5387
5388         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5389         trunc_range = cpos + len;
5390
5391         if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5392                 int next_free;
5393
5394                 memset(rec, 0, sizeof(*rec));
5395                 ocfs2_cleanup_merge(el, index);
5396                 wants_rotate = 1;
5397
5398                 next_free = le16_to_cpu(el->l_next_free_rec);
5399                 if (is_rightmost_tree_rec && next_free > 1) {
5400                         /*
5401                          * We skip the edge update if this path will
5402                          * be deleted by the rotate code.
5403                          */
5404                         rec = &el->l_recs[next_free - 1];
5405                         ocfs2_adjust_rightmost_records(handle, et, path,
5406                                                        rec);
5407                 }
5408         } else if (le32_to_cpu(rec->e_cpos) == cpos) {
5409                 /* Remove leftmost portion of the record. */
5410                 le32_add_cpu(&rec->e_cpos, len);
5411                 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5412                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5413         } else if (rec_range == trunc_range) {
5414                 /* Remove rightmost portion of the record */
5415                 le16_add_cpu(&rec->e_leaf_clusters, -len);
5416                 if (is_rightmost_tree_rec)
5417                         ocfs2_adjust_rightmost_records(handle, et, path, rec);
5418         } else {
5419                 /* Caller should have trapped this. */
5420                 mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5421                      "(%u, %u)\n",
5422                      (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5423                      le32_to_cpu(rec->e_cpos),
5424                      le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5425                 BUG();
5426         }
5427
5428         if (left_path) {
5429                 int subtree_index;
5430
5431                 subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5432                 ocfs2_complete_edge_insert(handle, left_path, path,
5433                                            subtree_index);
5434         }
5435
5436         ocfs2_journal_dirty(handle, path_leaf_bh(path));
5437
5438         ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5439         if (ret) {
5440                 mlog_errno(ret);
5441                 goto out;
5442         }
5443
5444 out:
5445         ocfs2_free_path(left_path);
5446         return ret;
5447 }
5448
5449 int ocfs2_remove_extent(handle_t *handle,
5450                         struct ocfs2_extent_tree *et,
5451                         u32 cpos, u32 len,
5452                         struct ocfs2_alloc_context *meta_ac,
5453                         struct ocfs2_cached_dealloc_ctxt *dealloc)
5454 {
5455         int ret, index;
5456         u32 rec_range, trunc_range;
5457         struct ocfs2_extent_rec *rec;
5458         struct ocfs2_extent_list *el;
5459         struct ocfs2_path *path = NULL;
5460
5461         /*
5462          * XXX: Why are we truncating to 0 instead of wherever this
5463          * affects us?
5464          */
5465         ocfs2_et_extent_map_truncate(et, 0);
5466
5467         path = ocfs2_new_path_from_et(et);
5468         if (!path) {
5469                 ret = -ENOMEM;
5470                 mlog_errno(ret);
5471                 goto out;
5472         }
5473
5474         ret = ocfs2_find_path(et->et_ci, path, cpos);
5475         if (ret) {
5476                 mlog_errno(ret);
5477                 goto out;
5478         }
5479
5480         el = path_leaf_el(path);
5481         index = ocfs2_search_extent_list(el, cpos);
5482         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5483                 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5484                             "Owner %llu has an extent at cpos %u which can no "
5485                             "longer be found.\n",
5486                             (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5487                             cpos);
5488                 ret = -EROFS;
5489                 goto out;
5490         }
5491
5492         /*
5493          * We have 3 cases of extent removal:
5494          *   1) Range covers the entire extent rec
5495          *   2) Range begins or ends on one edge of the extent rec
5496          *   3) Range is in the middle of the extent rec (no shared edges)
5497          *
5498          * For case 1 we remove the extent rec and left rotate to
5499          * fill the hole.
5500          *
5501          * For case 2 we just shrink the existing extent rec, with a
5502          * tree update if the shrinking edge is also the edge of an
5503          * extent block.
5504          *
5505          * For case 3 we do a right split to turn the extent rec into
5506          * something case 2 can handle.
5507          */
5508         rec = &el->l_recs[index];
5509         rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5510         trunc_range = cpos + len;
5511
5512         BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5513
5514         mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
5515              "(cpos %u, len %u)\n",
5516              (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5517              cpos, len, index,
5518              le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5519
5520         if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5521                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5522                                          cpos, len);
5523                 if (ret) {
5524                         mlog_errno(ret);
5525                         goto out;
5526                 }
5527         } else {
5528                 ret = ocfs2_split_tree(handle, et, path, index,
5529                                        trunc_range, meta_ac);
5530                 if (ret) {
5531                         mlog_errno(ret);
5532                         goto out;
5533                 }
5534
5535                 /*
5536                  * The split could have manipulated the tree enough to
5537                  * move the record location, so we have to look for it again.
5538                  */
5539                 ocfs2_reinit_path(path, 1);
5540
5541                 ret = ocfs2_find_path(et->et_ci, path, cpos);
5542                 if (ret) {
5543                         mlog_errno(ret);
5544                         goto out;
5545                 }
5546
5547                 el = path_leaf_el(path);
5548                 index = ocfs2_search_extent_list(el, cpos);
5549                 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5550                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5551                                     "Owner %llu: split at cpos %u lost record.",
5552                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5553                                     cpos);
5554                         ret = -EROFS;
5555                         goto out;
5556                 }
5557
5558                 /*
5559                  * Double check our values here. If anything is fishy,
5560                  * it's easier to catch it at the top level.
5561                  */
5562                 rec = &el->l_recs[index];
5563                 rec_range = le32_to_cpu(rec->e_cpos) +
5564                         ocfs2_rec_clusters(el, rec);
5565                 if (rec_range != trunc_range) {
5566                         ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5567                                     "Owner %llu: error after split at cpos %u"
5568                                     "trunc len %u, existing record is (%u,%u)",
5569                                     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5570                                     cpos, len, le32_to_cpu(rec->e_cpos),
5571                                     ocfs2_rec_clusters(el, rec));
5572                         ret = -EROFS;
5573                         goto out;
5574                 }
5575
5576                 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5577                                          cpos, len);
5578                 if (ret) {
5579                         mlog_errno(ret);
5580                         goto out;
5581                 }
5582         }
5583
5584 out:
5585         ocfs2_free_path(path);
5586         return ret;
5587 }
5588
5589 int ocfs2_remove_btree_range(struct inode *inode,
5590                              struct ocfs2_extent_tree *et,
5591                              u32 cpos, u32 phys_cpos, u32 len,
5592                              struct ocfs2_cached_dealloc_ctxt *dealloc)
5593 {
5594         int ret;
5595         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5596         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5597         struct inode *tl_inode = osb->osb_tl_inode;
5598         handle_t *handle;
5599         struct ocfs2_alloc_context *meta_ac = NULL;
5600
5601         ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5602         if (ret) {
5603                 mlog_errno(ret);
5604                 return ret;
5605         }
5606
5607         mutex_lock(&tl_inode->i_mutex);
5608
5609         if (ocfs2_truncate_log_needs_flush(osb)) {
5610                 ret = __ocfs2_flush_truncate_log(osb);
5611                 if (ret < 0) {
5612                         mlog_errno(ret);
5613                         goto out;
5614                 }
5615         }
5616
5617         handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5618         if (IS_ERR(handle)) {
5619                 ret = PTR_ERR(handle);
5620                 mlog_errno(ret);
5621                 goto out;
5622         }
5623
5624         ret = ocfs2_et_root_journal_access(handle, et,
5625                                            OCFS2_JOURNAL_ACCESS_WRITE);
5626         if (ret) {
5627                 mlog_errno(ret);
5628                 goto out;
5629         }
5630
5631         dquot_free_space_nodirty(inode,
5632                                   ocfs2_clusters_to_bytes(inode->i_sb, len));
5633
5634         ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5635         if (ret) {
5636                 mlog_errno(ret);
5637                 goto out_commit;
5638         }
5639
5640         ocfs2_et_update_clusters(et, -len);
5641
5642         ocfs2_journal_dirty(handle, et->et_root_bh);
5643
5644         ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5645         if (ret)
5646                 mlog_errno(ret);
5647
5648 out_commit:
5649         ocfs2_commit_trans(osb, handle);
5650 out:
5651         mutex_unlock(&tl_inode->i_mutex);
5652
5653         if (meta_ac)
5654                 ocfs2_free_alloc_context(meta_ac);
5655
5656         return ret;
5657 }
5658
5659 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5660 {
5661         struct buffer_head *tl_bh = osb->osb_tl_bh;
5662         struct ocfs2_dinode *di;
5663         struct ocfs2_truncate_log *tl;
5664
5665         di = (struct ocfs2_dinode *) tl_bh->b_data;
5666         tl = &di->id2.i_dealloc;
5667
5668         mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5669                         "slot %d, invalid truncate log parameters: used = "
5670                         "%u, count = %u\n", osb->slot_num,
5671                         le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5672         return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5673 }
5674
5675 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5676                                            unsigned int new_start)
5677 {
5678         unsigned int tail_index;
5679         unsigned int current_tail;
5680
5681         /* No records, nothing to coalesce */
5682         if (!le16_to_cpu(tl->tl_used))
5683                 return 0;
5684
5685         tail_index = le16_to_cpu(tl->tl_used) - 1;
5686         current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5687         current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5688
5689         return current_tail == new_start;
5690 }
5691
5692 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5693                               handle_t *handle,
5694                               u64 start_blk,
5695                               unsigned int num_clusters)
5696 {
5697         int status, index;
5698         unsigned int start_cluster, tl_count;
5699         struct inode *tl_inode = osb->osb_tl_inode;
5700         struct buffer_head *tl_bh = osb->osb_tl_bh;
5701         struct ocfs2_dinode *di;
5702         struct ocfs2_truncate_log *tl;
5703
5704         mlog_entry("start_blk = %llu, num_clusters = %u\n",
5705                    (unsigned long long)start_blk, num_clusters);
5706
5707         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5708
5709         start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5710
5711         di = (struct ocfs2_dinode *) tl_bh->b_data;
5712
5713         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5714          * by the underlying call to ocfs2_read_inode_block(), so any
5715          * corruption is a code bug */
5716         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5717
5718         tl = &di->id2.i_dealloc;
5719         tl_count = le16_to_cpu(tl->tl_count);
5720         mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5721                         tl_count == 0,
5722                         "Truncate record count on #%llu invalid "
5723                         "wanted %u, actual %u\n",
5724                         (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5725                         ocfs2_truncate_recs_per_inode(osb->sb),
5726                         le16_to_cpu(tl->tl_count));
5727
5728         /* Caller should have known to flush before calling us. */
5729         index = le16_to_cpu(tl->tl_used);
5730         if (index >= tl_count) {
5731                 status = -ENOSPC;
5732                 mlog_errno(status);
5733                 goto bail;
5734         }
5735
5736         status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5737                                          OCFS2_JOURNAL_ACCESS_WRITE);
5738         if (status < 0) {
5739                 mlog_errno(status);
5740                 goto bail;
5741         }
5742
5743         mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5744              "%llu (index = %d)\n", num_clusters, start_cluster,
5745              (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5746
5747         if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5748                 /*
5749                  * Move index back to the record we are coalescing with.
5750                  * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5751                  */
5752                 index--;
5753
5754                 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5755                 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5756                      index, le32_to_cpu(tl->tl_recs[index].t_start),
5757                      num_clusters);
5758         } else {
5759                 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5760                 tl->tl_used = cpu_to_le16(index + 1);
5761         }
5762         tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5763
5764         ocfs2_journal_dirty(handle, tl_bh);
5765
5766 bail:
5767         mlog_exit(status);
5768         return status;
5769 }
5770
5771 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5772                                          handle_t *handle,
5773                                          struct inode *data_alloc_inode,
5774                                          struct buffer_head *data_alloc_bh)
5775 {
5776         int status = 0;
5777         int i;
5778         unsigned int num_clusters;
5779         u64 start_blk;
5780         struct ocfs2_truncate_rec rec;
5781         struct ocfs2_dinode *di;
5782         struct ocfs2_truncate_log *tl;
5783         struct inode *tl_inode = osb->osb_tl_inode;
5784         struct buffer_head *tl_bh = osb->osb_tl_bh;
5785
5786         mlog_entry_void();
5787
5788         di = (struct ocfs2_dinode *) tl_bh->b_data;
5789         tl = &di->id2.i_dealloc;
5790         i = le16_to_cpu(tl->tl_used) - 1;
5791         while (i >= 0) {
5792                 /* Caller has given us at least enough credits to
5793                  * update the truncate log dinode */
5794                 status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5795                                                  OCFS2_JOURNAL_ACCESS_WRITE);
5796                 if (status < 0) {
5797                         mlog_errno(status);
5798                         goto bail;
5799                 }
5800
5801                 tl->tl_used = cpu_to_le16(i);
5802
5803                 ocfs2_journal_dirty(handle, tl_bh);
5804
5805                 /* TODO: Perhaps we can calculate the bulk of the
5806                  * credits up front rather than extending like
5807                  * this. */
5808                 status = ocfs2_extend_trans(handle,
5809                                             OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5810                 if (status < 0) {
5811                         mlog_errno(status);
5812                         goto bail;
5813                 }
5814
5815                 rec = tl->tl_recs[i];
5816                 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5817                                                     le32_to_cpu(rec.t_start));
5818                 num_clusters = le32_to_cpu(rec.t_clusters);
5819
5820                 /* if start_blk is not set, we ignore the record as
5821                  * invalid. */
5822                 if (start_blk) {
5823                         mlog(0, "free record %d, start = %u, clusters = %u\n",
5824                              i, le32_to_cpu(rec.t_start), num_clusters);
5825
5826                         status = ocfs2_free_clusters(handle, data_alloc_inode,
5827                                                      data_alloc_bh, start_blk,
5828                                                      num_clusters);
5829                         if (status < 0) {
5830                                 mlog_errno(status);
5831                                 goto bail;
5832                         }
5833                 }
5834                 i--;
5835         }
5836
5837 bail:
5838         mlog_exit(status);
5839         return status;
5840 }
5841
5842 /* Expects you to already be holding tl_inode->i_mutex */
5843 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5844 {
5845         int status;
5846         unsigned int num_to_flush;
5847         handle_t *handle;
5848         struct inode *tl_inode = osb->osb_tl_inode;
5849         struct inode *data_alloc_inode = NULL;
5850         struct buffer_head *tl_bh = osb->osb_tl_bh;
5851         struct buffer_head *data_alloc_bh = NULL;
5852         struct ocfs2_dinode *di;
5853         struct ocfs2_truncate_log *tl;
5854
5855         mlog_entry_void();
5856
5857         BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5858
5859         di = (struct ocfs2_dinode *) tl_bh->b_data;
5860
5861         /* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5862          * by the underlying call to ocfs2_read_inode_block(), so any
5863          * corruption is a code bug */
5864         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5865
5866         tl = &di->id2.i_dealloc;
5867         num_to_flush = le16_to_cpu(tl->tl_used);
5868         mlog(0, "Flush %u records from truncate log #%llu\n",
5869              num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5870         if (!num_to_flush) {
5871                 status = 0;
5872                 goto out;
5873         }
5874
5875         data_alloc_inode = ocfs2_get_system_file_inode(osb,
5876                                                        GLOBAL_BITMAP_SYSTEM_INODE,
5877                                                        OCFS2_INVALID_SLOT);
5878         if (!data_alloc_inode) {
5879                 status = -EINVAL;
5880                 mlog(ML_ERROR, "Could not get bitmap inode!\n");
5881                 goto out;
5882         }
5883
5884         mutex_lock(&data_alloc_inode->i_mutex);
5885
5886         status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5887         if (status < 0) {
5888                 mlog_errno(status);
5889                 goto out_mutex;
5890         }
5891
5892         handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5893         if (IS_ERR(handle)) {
5894                 status = PTR_ERR(handle);
5895                 mlog_errno(status);
5896                 goto out_unlock;
5897         }
5898
5899         status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5900                                                data_alloc_bh);
5901         if (status < 0)
5902                 mlog_errno(status);
5903
5904         ocfs2_commit_trans(osb, handle);
5905
5906 out_unlock:
5907         brelse(data_alloc_bh);
5908         ocfs2_inode_unlock(data_alloc_inode, 1);
5909
5910 out_mutex:
5911         mutex_unlock(&data_alloc_inode->i_mutex);
5912         iput(data_alloc_inode);
5913
5914 out:
5915         mlog_exit(status);
5916         return status;
5917 }
5918
5919 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5920 {
5921         int status;
5922         struct inode *tl_inode = osb->osb_tl_inode;
5923
5924         mutex_lock(&tl_inode->i_mutex);
5925         status = __ocfs2_flush_truncate_log(osb);
5926         mutex_unlock(&tl_inode->i_mutex);
5927
5928         return status;
5929 }
5930
5931 static void ocfs2_truncate_log_worker(struct work_struct *work)
5932 {
5933         int status;
5934         struct ocfs2_super *osb =
5935                 container_of(work, struct ocfs2_super,
5936                              osb_truncate_log_wq.work);
5937
5938         mlog_entry_void();
5939
5940         status = ocfs2_flush_truncate_log(osb);
5941         if (status < 0)
5942                 mlog_errno(status);
5943         else
5944                 ocfs2_init_steal_slots(osb);
5945
5946         mlog_exit(status);
5947 }
5948
5949 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5950 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5951                                        int cancel)
5952 {
5953         if (osb->osb_tl_inode) {
5954                 /* We want to push off log flushes while truncates are
5955                  * still running. */
5956                 if (cancel)
5957                         cancel_delayed_work(&osb->osb_truncate_log_wq);
5958
5959                 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5960                                    OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5961         }
5962 }
5963
5964 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5965                                        int slot_num,
5966                                        struct inode **tl_inode,
5967                                        struct buffer_head **tl_bh)
5968 {
5969         int status;
5970         struct inode *inode = NULL;
5971         struct buffer_head *bh = NULL;
5972
5973         inode = ocfs2_get_system_file_inode(osb,
5974                                            TRUNCATE_LOG_SYSTEM_INODE,
5975                                            slot_num);
5976         if (!inode) {
5977                 status = -EINVAL;
5978                 mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5979                 goto bail;
5980         }
5981
5982         status = ocfs2_read_inode_block(inode, &bh);
5983         if (status < 0) {
5984                 iput(inode);
5985                 mlog_errno(status);
5986                 goto bail;
5987         }
5988
5989         *tl_inode = inode;
5990         *tl_bh    = bh;
5991 bail:
5992         mlog_exit(status);
5993         return status;
5994 }
5995
5996 /* called during the 1st stage of node recovery. we stamp a clean
5997  * truncate log and pass back a copy for processing later. if the
5998  * truncate log does not require processing, a *tl_copy is set to
5999  * NULL. */
6000 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6001                                       int slot_num,
6002                                       struct ocfs2_dinode **tl_copy)
6003 {
6004         int status;
6005         struct inode *tl_inode = NULL;
6006         struct buffer_head *tl_bh = NULL;
6007         struct ocfs2_dinode *di;
6008         struct ocfs2_truncate_log *tl;
6009
6010         *tl_copy = NULL;
6011
6012         mlog(0, "recover truncate log from slot %d\n", slot_num);
6013
6014         status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6015         if (status < 0) {
6016                 mlog_errno(status);
6017                 goto bail;
6018         }
6019
6020         di = (struct ocfs2_dinode *) tl_bh->b_data;
6021
6022         /* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
6023          * validated by the underlying call to ocfs2_read_inode_block(),
6024          * so any corruption is a code bug */
6025         BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6026
6027         tl = &di->id2.i_dealloc;
6028         if (le16_to_cpu(tl->tl_used)) {
6029                 mlog(0, "We'll have %u logs to recover\n",
6030                      le16_to_cpu(tl->tl_used));
6031
6032                 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6033                 if (!(*tl_copy)) {
6034                         status = -ENOMEM;
6035                         mlog_errno(status);
6036                         goto bail;
6037                 }
6038
6039                 /* Assuming the write-out below goes well, this copy
6040                  * will be passed back to recovery for processing. */
6041                 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
6042
6043                 /* All we need to do to clear the truncate log is set
6044                  * tl_used. */
6045                 tl->tl_used = 0;
6046
6047                 ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6048                 status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6049                 if (status < 0) {
6050                         mlog_errno(status);
6051                         goto bail;
6052                 }
6053         }
6054
6055 bail:
6056         if (tl_inode)
6057                 iput(tl_inode);
6058         brelse(tl_bh);
6059
6060         if (status < 0 && (*tl_copy)) {
6061                 kfree(*tl_copy);
6062                 *tl_copy = NULL;
6063         }
6064
6065         mlog_exit(status);
6066         return status;
6067 }
6068
6069 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6070                                          struct ocfs2_dinode *tl_copy)
6071 {
6072         int status = 0;
6073         int i;
6074         unsigned int clusters, num_recs, start_cluster;
6075         u64 start_blk;
6076         handle_t *handle;
6077         struct inode *tl_inode = osb->osb_tl_inode;
6078         struct ocfs2_truncate_log *tl;
6079
6080         mlog_entry_void();
6081
6082         if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6083                 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6084                 return -EINVAL;
6085         }
6086
6087         tl = &tl_copy->id2.i_dealloc;
6088         num_recs = le16_to_cpu(tl->tl_used);
6089         mlog(0, "cleanup %u records from %llu\n", num_recs,
6090              (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
6091
6092         mutex_lock(&tl_inode->i_mutex);
6093         for(i = 0; i < num_recs; i++) {
6094                 if (ocfs2_truncate_log_needs_flush(osb)) {
6095                         status = __ocfs2_flush_truncate_log(osb);
6096                         if (status < 0) {
6097                                 mlog_errno(status);
6098                                 goto bail_up;
6099                         }
6100                 }
6101
6102                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6103                 if (IS_ERR(handle)) {
6104                         status = PTR_ERR(handle);
6105                         mlog_errno(status);
6106                         goto bail_up;
6107                 }
6108
6109                 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
6110                 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
6111                 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
6112
6113                 status = ocfs2_truncate_log_append(osb, handle,
6114                                                    start_blk, clusters);
6115                 ocfs2_commit_trans(osb, handle);
6116                 if (status < 0) {
6117                         mlog_errno(status);
6118                         goto bail_up;
6119                 }
6120         }
6121
6122 bail_up:
6123         mutex_unlock(&tl_inode->i_mutex);
6124
6125         mlog_exit(status);
6126         return status;
6127 }
6128
6129 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6130 {
6131         int status;
6132         struct inode *tl_inode = osb->osb_tl_inode;
6133
6134         mlog_entry_void();
6135
6136         if (tl_inode) {
6137                 cancel_delayed_work(&osb->osb_truncate_log_wq);
6138                 flush_workqueue(ocfs2_wq);
6139
6140                 status = ocfs2_flush_truncate_log(osb);
6141                 if (status < 0)
6142                         mlog_errno(status);
6143
6144                 brelse(osb->osb_tl_bh);
6145                 iput(osb->osb_tl_inode);
6146         }
6147
6148         mlog_exit_void();
6149 }
6150
6151 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6152 {
6153         int status;
6154         struct inode *tl_inode = NULL;
6155         struct buffer_head *tl_bh = NULL;
6156
6157         mlog_entry_void();
6158
6159         status = ocfs2_get_truncate_log_info(osb,
6160                                              osb->slot_num,
6161                                              &tl_inode,
6162                                              &tl_bh);
6163         if (status < 0)
6164                 mlog_errno(status);
6165
6166         /* ocfs2_truncate_log_shutdown keys on the existence of
6167          * osb->osb_tl_inode so we don't set any of the osb variables
6168          * until we're sure all is well. */
6169         INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6170                           ocfs2_truncate_log_worker);
6171         osb->osb_tl_bh    = tl_bh;
6172         osb->osb_tl_inode = tl_inode;
6173
6174         mlog_exit(status);
6175         return status;
6176 }
6177
6178 /*
6179  * Delayed de-allocation of suballocator blocks.
6180  *
6181  * Some sets of block de-allocations might involve multiple suballocator inodes.
6182  *
6183  * The locking for this can get extremely complicated, especially when
6184  * the suballocator inodes to delete from aren't known until deep
6185  * within an unrelated codepath.
6186  *
6187  * ocfs2_extent_block structures are a good example of this - an inode
6188  * btree could have been grown by any number of nodes each allocating
6189  * out of their own suballoc inode.
6190  *
6191  * These structures allow the delay of block de-allocation until a
6192  * later time, when locking of multiple cluster inodes won't cause
6193  * deadlock.
6194  */
6195
6196 /*
6197  * Describe a single bit freed from a suballocator.  For the block
6198  * suballocators, it represents one block.  For the global cluster
6199  * allocator, it represents some clusters and free_bit indicates
6200  * clusters number.
6201  */
6202 struct ocfs2_cached_block_free {
6203         struct ocfs2_cached_block_free          *free_next;
6204         u64                                     free_blk;
6205         unsigned int                            free_bit;
6206 };
6207
/*
 * List head grouping cached frees, chained to further heads through
 * f_next_suballocator.
 *
 * NOTE(review): f_inode_type and f_slot presumably identify the
 * suballocator system inode (type and slot) the entries on f_first
 * belong to -- confirm against the users of this structure.
 */
struct ocfs2_per_slot_free_list {
        struct ocfs2_per_slot_free_list *f_next_suballocator;
        int f_inode_type;
        int f_slot;
        /* First entry of this list's chain of cached frees. */
        struct ocfs2_cached_block_free *f_first;
};
6214
6215 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6216                                     int sysfile_type,
6217                                     int slot,
6218                                     struct ocfs2_cached_block_free *head)
6219 {
6220         int ret;
6221         u64 bg_blkno;
6222         handle_t *handle;
6223         struct inode *inode;
6224         struct buffer_head *di_bh = NULL;
6225         struct ocfs2_cached_block_free *tmp;
6226
6227         inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6228         if (!inode) {
6229                 ret = -EINVAL;
6230                 mlog_errno(ret);
6231                 goto out;
6232         }
6233
6234         mutex_lock(&inode->i_mutex);
6235
6236         ret = ocfs2_inode_lock(inode, &di_bh, 1);
6237         if (ret) {
6238                 mlog_errno(ret);
6239                 goto out_mutex;
6240         }
6241
6242         handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6243         if (IS_ERR(handle)) {
6244                 ret = PTR_ERR(handle);
6245                 mlog_errno(ret);
6246                 goto out_unlock;
6247         }
6248
6249         while (head) {
6250                 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6251                                                       head->free_bit);
6252                 mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6253                      head->free_bit, (unsigned long long)head->free_blk);
6254
6255                 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6256                                                head->free_bit, bg_blkno, 1);
6257                 if (ret) {
6258                         mlog_errno(ret);
6259                         goto out_journal;
6260                 }
6261
6262                 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
6263                 if (ret) {
6264                         mlog_errno(ret);
6265                         goto out_journal;
6266                 }
6267
6268                 tmp = head;
6269                 head = head->free_next;
6270                 kfree(tmp);
6271         }
6272
6273 out_journal:
6274         ocfs2_commit_trans(osb, handle);
6275
6276 out_unlock:
6277         ocfs2_inode_unlock(inode, 1);
6278         brelse(di_bh);
6279 out_mutex:
6280         mutex_unlock(&inode->i_mutex);
6281         iput(inode);
6282 out:
6283         while(head) {
6284                 /* Premature exit may have left some dangling items. */
6285                 tmp = head;
6286                 head = head->free_next;
6287                 kfree(tmp);
6288         }
6289
6290         return ret;
6291 }
6292
6293 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6294                                 u64 blkno, unsigned int bit)
6295 {
6296         int ret = 0;
6297         struct ocfs2_cached_block_free *item;
6298
6299         item = kmalloc(sizeof(*item), GFP_NOFS);
6300         if (item == NULL) {
6301                 ret = -ENOMEM;
6302                 mlog_errno(ret);
6303                 return ret;
6304         }
6305
6306         mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6307              bit, (unsigned long long)blkno);
6308
6309         item->free_blk = blkno;
6310         item->free_bit = bit;
6311         item->free_next = ctxt->c_global_allocator;
6312
6313         ctxt->c_global_allocator = item;
6314         return ret;
6315 }
6316
6317 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6318                                       struct ocfs2_cached_block_free *head)
6319 {
6320         struct ocfs2_cached_block_free *tmp;
6321         struct inode *tl_inode = osb->osb_tl_inode;
6322         handle_t *handle;
6323         int ret = 0;
6324
6325         mutex_lock(&tl_inode->i_mutex);
6326
6327         while (head) {
6328                 if (ocfs2_truncate_log_needs_flush(osb)) {
6329                         ret = __ocfs2_flush_truncate_log(osb);
6330                         if (ret < 0) {
6331                                 mlog_errno(ret);
6332                                 break;
6333                         }
6334                 }
6335
6336                 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6337                 if (IS_ERR(handle)) {
6338                         ret = PTR_ERR(handle);
6339                         mlog_errno(ret);
6340                         break;
6341                 }
6342
6343                 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6344                                                 head->free_bit);
6345
6346                 ocfs2_commit_trans(osb, handle);
6347                 tmp = head;
6348                 head = head->free_next;
6349                 kfree(tmp);
6350
6351                 if (ret < 0) {
6352                         mlog_errno(ret);
6353                         break;
6354                 }
6355         }
6356
6357         mutex_unlock(&tl_inode->i_mutex);
6358
6359         while (head) {
6360                 /* Premature exit may have left some dangling items. */
6361                 tmp = head;
6362                 head = head->free_next;
6363                 kfree(tmp);
6364         }
6365
6366         return ret;
6367 }
6368
6369 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6370                        struct ocfs2_cached_dealloc_ctxt *ctxt)
6371 {
6372         int ret = 0, ret2;
6373         struct ocfs2_per_slot_free_list *fl;
6374
6375         if (!ctxt)
6376                 return 0;
6377
6378         while (ctxt->c_first_suballocator) {
6379                 fl = ctxt->c_first_suballocator;
6380
6381                 if (fl->f_first) {
6382                         mlog(0, "Free items: (type %u, slot %d)\n",
6383                              fl->f_inode_type, fl->f_slot);
6384                         ret2 = ocfs2_free_cached_blocks(osb,
6385                                                         fl->f_inode_type,
6386                                                         fl->f_slot,
6387                                                         fl->f_first);
6388                         if (ret2)
6389                                 mlog_errno(ret2);
6390                         if (!ret)
6391                                 ret = ret2;
6392                 }
6393
6394                 ctxt->c_first_suballocator = fl->f_next_suballocator;
6395                 kfree(fl);
6396         }
6397
6398         if (ctxt->c_global_allocator) {
6399                 ret2 = ocfs2_free_cached_clusters(osb,
6400                                                   ctxt->c_global_allocator);
6401                 if (ret2)
6402                         mlog_errno(ret2);
6403                 if (!ret)
6404                         ret = ret2;
6405
6406                 ctxt->c_global_allocator = NULL;
6407         }
6408
6409         return ret;
6410 }
6411
6412 static struct ocfs2_per_slot_free_list *
6413 ocfs2_find_per_slot_free_list(int type,
6414                               int slot,
6415                               struct ocfs2_cached_dealloc_ctxt *ctxt)
6416 {
6417         struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6418
6419         while (fl) {
6420                 if (fl->f_inode_type == type && fl->f_slot == slot)
6421                         return fl;
6422
6423                 fl = fl->f_next_suballocator;
6424         }
6425
6426         fl = kmalloc(sizeof(*fl), GFP_NOFS);
6427         if (fl) {
6428                 fl->f_inode_type = type;
6429                 fl->f_slot = slot;
6430                 fl->f_first = NULL;
6431                 fl->f_next_suballocator = ctxt->c_first_suballocator;
6432
6433                 ctxt->c_first_suballocator = fl;
6434         }
6435         return fl;
6436 }
6437
6438 int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6439                               int type, int slot, u64 blkno,
6440                               unsigned int bit)
6441 {
6442         int ret;
6443         struct ocfs2_per_slot_free_list *fl;
6444         struct ocfs2_cached_block_free *item;
6445
6446         fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6447         if (fl == NULL) {
6448                 ret = -ENOMEM;
6449                 mlog_errno(ret);
6450                 goto out;
6451         }
6452
6453         item = kmalloc(sizeof(*item), GFP_NOFS);
6454         if (item == NULL) {
6455                 ret = -ENOMEM;
6456                 mlog_errno(ret);
6457                 goto out;
6458         }
6459
6460         mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6461              type, slot, bit, (unsigned long long)blkno);
6462
6463         item->free_blk = blkno;
6464         item->free_bit = bit;
6465         item->free_next = fl->f_first;
6466
6467         fl->f_first = item;
6468
6469         ret = 0;
6470 out:
6471         return ret;
6472 }
6473
6474 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6475                                          struct ocfs2_extent_block *eb)
6476 {
6477         return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6478                                          le16_to_cpu(eb->h_suballoc_slot),
6479                                          le64_to_cpu(eb->h_blkno),
6480                                          le16_to_cpu(eb->h_suballoc_bit));
6481 }
6482
6483 /* This function will figure out whether the currently last extent
6484  * block will be deleted, and if it will, what the new last extent
6485  * block will be so we can update his h_next_leaf_blk field, as well
6486  * as the dinodes i_last_eb_blk */
6487 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6488                                        unsigned int clusters_to_del,
6489                                        struct ocfs2_path *path,
6490                                        struct buffer_head **new_last_eb)
6491 {
6492         int next_free, ret = 0;
6493         u32 cpos;
6494         struct ocfs2_extent_rec *rec;
6495         struct ocfs2_extent_block *eb;
6496         struct ocfs2_extent_list *el;
6497         struct buffer_head *bh = NULL;
6498
6499         *new_last_eb = NULL;
6500
6501         /* we have no tree, so of course, no last_eb. */
6502         if (!path->p_tree_depth)
6503                 goto out;
6504
6505         /* trunc to zero special case - this makes tree_depth = 0
6506          * regardless of what it is.  */
6507         if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6508                 goto out;
6509
6510         el = path_leaf_el(path);
6511         BUG_ON(!el->l_next_free_rec);
6512
6513         /*
6514          * Make sure that this extent list will actually be empty
6515          * after we clear away the data. We can shortcut out if
6516          * there's more than one non-empty extent in the
6517          * list. Otherwise, a check of the remaining extent is
6518          * necessary.
6519          */
6520         next_free = le16_to_cpu(el->l_next_free_rec);
6521         rec = NULL;
6522         if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6523                 if (next_free > 2)
6524                         goto out;
6525
6526                 /* We may have a valid extent in index 1, check it. */
6527                 if (next_free == 2)
6528                         rec = &el->l_recs[1];
6529
6530                 /*
6531                  * Fall through - no more nonempty extents, so we want
6532                  * to delete this leaf.
6533                  */
6534         } else {
6535                 if (next_free > 1)
6536                         goto out;
6537
6538                 rec = &el->l_recs[0];
6539         }
6540
6541         if (rec) {
6542                 /*
6543                  * Check it we'll only be trimming off the end of this
6544                  * cluster.
6545                  */
6546                 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6547                         goto out;
6548         }
6549
6550         ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6551         if (ret) {
6552                 mlog_errno(ret);
6553                 goto out;
6554         }
6555
6556         ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6557         if (ret) {
6558                 mlog_errno(ret);
6559                 goto out;
6560         }
6561
6562         eb = (struct ocfs2_extent_block *) bh->b_data;
6563         el = &eb->h_list;
6564
6565         /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6566          * Any corruption is a code bug. */
6567         BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6568
6569         *new_last_eb = bh;
6570         get_bh(*new_last_eb);
6571         mlog(0, "returning block %llu, (cpos: %u)\n",
6572              (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6573 out:
6574         brelse(bh);
6575
6576         return ret;
6577 }
6578
/*
 * Trim some clusters off the rightmost edge of a tree. Only called
 * during truncate.
 *
 * The caller needs to:
 *   - start journaling of each path component.
 *   - compute and fully set up any new last ext block
 *
 * The path is walked from the leaf (index == p_tree_depth) up to the
 * root (index 0).  At the leaf, clusters_to_del clusters are removed
 * from the tail record; at each interior node the tail record is
 * either shrunk to the new right edge or removed when the child below
 * it was deleted.  Extent blocks that end up empty are queued on
 * tc->tc_dealloc for later release.
 *
 * On return *delete_start holds the first block of the trimmed range
 * (0 if no leaf record was trimmed) and *flags holds the e_flags of
 * the trimmed leaf record (0 if none).
 *
 * Returns 0 on success, or -EROFS if an extent list's tree depth does
 * not match its position in the path.
 */
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
                           handle_t *handle, struct ocfs2_truncate_context *tc,
                           u32 clusters_to_del, u64 *delete_start, u8 *flags)
{
        int ret, i, index = path->p_tree_depth;
        u32 new_edge = 0;
        u64 deleted_eb = 0;
        struct buffer_head *bh;
        struct ocfs2_extent_list *el;
        struct ocfs2_extent_rec *rec;

        *delete_start = 0;
        *flags = 0;

        /* Bottom-up walk: leaf first, root (index 0) last. */
        while (index >= 0) {
                bh = path->p_node[index].bh;
                el = path->p_node[index].el;

                mlog(0, "traveling tree (index = %d, block = %llu)\n",
                     index,  (unsigned long long)bh->b_blocknr);

                BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);

                /*
                 * The on-disk depth of this list must agree with where
                 * it sits in the path.
                 */
                if (index !=
                    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
                        ocfs2_error(inode->i_sb,
                                    "Inode %lu has invalid ext. block %llu",
                                    inode->i_ino,
                                    (unsigned long long)bh->b_blocknr);
                        ret = -EROFS;
                        goto out;
                }

find_tail_record:
                /* The rightmost in-use record is the one being trimmed. */
                i = le16_to_cpu(el->l_next_free_rec) - 1;
                rec = &el->l_recs[i];

                mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
                     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
                     ocfs2_rec_clusters(el, rec),
                     (unsigned long long)le64_to_cpu(rec->e_blkno),
                     le16_to_cpu(el->l_next_free_rec));

                BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);

                if (le16_to_cpu(el->l_tree_depth) == 0) {
                        /*
                         * If the leaf block contains a single empty
                         * extent and no records, we can just remove
                         * the block.
                         */
                        if (i == 0 && ocfs2_is_empty_extent(rec)) {
                                memset(rec, 0,
                                       sizeof(struct ocfs2_extent_rec));
                                el->l_next_free_rec = cpu_to_le16(0);

                                goto delete;
                        }

                        /*
                         * Remove any empty extents by shifting things
                         * left. That should make life much easier on
                         * the code below. This condition is rare
                         * enough that we shouldn't see a performance
                         * hit.
                         */
                        if (ocfs2_is_empty_extent(&el->l_recs[0])) {
                                le16_add_cpu(&el->l_next_free_rec, -1);

                                for(i = 0;
                                    i < le16_to_cpu(el->l_next_free_rec); i++)
                                        el->l_recs[i] = el->l_recs[i + 1];

                                /* Clear the slot vacated by the shift. */
                                memset(&el->l_recs[i], 0,
                                       sizeof(struct ocfs2_extent_rec));

                                /*
                                 * We've modified our extent list. The
                                 * simplest way to handle this change
                                 * is to begin the search from the
                                 * start again.
                                 */
                                goto find_tail_record;
                        }

                        le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);

                        /*
                         * We'll use "new_edge" on our way back up the
                         * tree to know what our rightmost cpos is.
                         */
                        new_edge = le16_to_cpu(rec->e_leaf_clusters);
                        new_edge += le32_to_cpu(rec->e_cpos);

                        /*
                         * The caller will use this to delete data blocks:
                         * the first block past what remains of the record.
                         */
                        *delete_start = le64_to_cpu(rec->e_blkno)
                                + ocfs2_clusters_to_blocks(inode->i_sb,
                                        le16_to_cpu(rec->e_leaf_clusters));
                        *flags = rec->e_flags;

                        /*
                         * If it's now empty, remove this record.
                         */
                        if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
                                memset(rec, 0,
                                       sizeof(struct ocfs2_extent_rec));
                                le16_add_cpu(&el->l_next_free_rec, -1);
                        }
                } else {
                        /*
                         * Interior node. If the tail record points at
                         * the child we just deleted, drop the record.
                         */
                        if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
                                memset(rec, 0,
                                       sizeof(struct ocfs2_extent_rec));
                                le16_add_cpu(&el->l_next_free_rec, -1);

                                goto delete;
                        }

                        /* Can this actually happen? */
                        if (le16_to_cpu(el->l_next_free_rec) == 0)
                                goto delete;

                        /*
                         * We never actually deleted any clusters
                         * because our leaf was empty. There's no
                         * reason to adjust the rightmost edge then.
                         */
                        if (new_edge == 0)
                                goto delete;

                        /* New length: new right edge minus record start. */
                        rec->e_int_clusters = cpu_to_le32(new_edge);
                        le32_add_cpu(&rec->e_int_clusters,
                                     -le32_to_cpu(rec->e_cpos));

                         /*
                          * A deleted child record should have been
                          * caught above.
                          */
                         BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
                }

delete:
                ocfs2_journal_dirty(handle, bh);

                mlog(0, "extent list container %llu, after: record %d: "
                     "(%u, %u, %llu), next = %u.\n",
                     (unsigned long long)bh->b_blocknr, i,
                     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
                     (unsigned long long)le64_to_cpu(rec->e_blkno),
                     le16_to_cpu(el->l_next_free_rec));

                /*
                 * We must be careful to only attempt delete of an
                 * extent block (and not the root inode block).
                 */
                if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
                        struct ocfs2_extent_block *eb =
                                (struct ocfs2_extent_block *)bh->b_data;

                        /*
                         * Save this for use when processing the
                         * parent block.
                         */
                        deleted_eb = le64_to_cpu(eb->h_blkno);

                        mlog(0, "deleting this extent block.\n");

                        ocfs2_remove_from_cache(INODE_CACHE(inode), bh);

                        /* An emptied list must have a zeroed first record. */
                        BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
                        BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
                        BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));

                        ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
                        /* An error here is not fatal. */
                        if (ret < 0)
                                mlog_errno(ret);
                } else {
                        /* This block survives; the parent keeps its record. */
                        deleted_eb = 0;
                }

                index--;
        }

        ret = 0;
out:
        return ret;
}
6776
6777 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6778                              unsigned int clusters_to_del,
6779                              struct inode *inode,
6780                              struct buffer_head *fe_bh,
6781                              handle_t *handle,
6782                              struct ocfs2_truncate_context *tc,
6783                              struct ocfs2_path *path,
6784                              struct ocfs2_alloc_context *meta_ac)
6785 {
6786         int status;
6787         struct ocfs2_dinode *fe;
6788         struct ocfs2_extent_block *last_eb = NULL;
6789         struct ocfs2_extent_list *el;
6790         struct buffer_head *last_eb_bh = NULL;
6791         u64 delete_blk = 0;
6792         u8 rec_flags;
6793
6794         fe = (struct ocfs2_dinode *) fe_bh->b_data;
6795
6796         status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6797                                              path, &last_eb_bh);
6798         if (status < 0) {
6799                 mlog_errno(status);
6800                 goto bail;
6801         }
6802
6803         /*
6804          * Each component will be touched, so we might as well journal
6805          * here to avoid having to handle errors later.
6806          */
6807         status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6808         if (status < 0) {
6809                 mlog_errno(status);
6810                 goto bail;
6811         }
6812
6813         if (last_eb_bh) {
6814                 status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6815                                                  OCFS2_JOURNAL_ACCESS_WRITE);
6816                 if (status < 0) {
6817                         mlog_errno(status);
6818                         goto bail;
6819                 }
6820
6821                 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6822         }
6823
6824         el = &(fe->id2.i_list);
6825
6826         /*
6827          * Lower levels depend on this never happening, but it's best
6828          * to check it up here before changing the tree.
6829          */
6830         if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6831                 ocfs2_error(inode->i_sb,
6832                             "Inode %lu has an empty extent record, depth %u\n",
6833                             inode->i_ino, le16_to_cpu(el->l_tree_depth));
6834                 status = -EROFS;
6835                 goto bail;
6836         }
6837
6838         dquot_free_space_nodirty(inode,
6839                         ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6840         spin_lock(&OCFS2_I(inode)->ip_lock);
6841         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6842                                       clusters_to_del;
6843         spin_unlock(&OCFS2_I(inode)->ip_lock);
6844         le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6845         inode->i_blocks = ocfs2_inode_sector_count(inode);
6846
6847         status = ocfs2_trim_tree(inode, path, handle, tc,
6848                                  clusters_to_del, &delete_blk, &rec_flags);
6849         if (status) {
6850                 mlog_errno(status);
6851                 goto bail;
6852         }
6853
6854         if (le32_to_cpu(fe->i_clusters) == 0) {
6855                 /* trunc to zero is a special case. */
6856                 el->l_tree_depth = 0;
6857                 fe->i_last_eb_blk = 0;
6858         } else if (last_eb)
6859                 fe->i_last_eb_blk = last_eb->h_blkno;
6860
6861         ocfs2_journal_dirty(handle, fe_bh);
6862
6863         if (last_eb) {
6864                 /* If there will be a new last extent block, then by
6865                  * definition, there cannot be any leaves to the right of
6866                  * him. */
6867                 last_eb->h_next_leaf_blk = 0;
6868                 ocfs2_journal_dirty(handle, last_eb_bh);
6869         }
6870
6871         if (delete_blk) {
6872                 if (rec_flags & OCFS2_EXT_REFCOUNTED)
6873                         status = ocfs2_decrease_refcount(inode, handle,
6874                                         ocfs2_blocks_to_clusters(osb->sb,
6875                                                                  delete_blk),
6876                                         clusters_to_del, meta_ac,
6877                                         &tc->tc_dealloc, 1);
6878                 else
6879                         status = ocfs2_truncate_log_append(osb, handle,
6880                                                            delete_blk,
6881                                                            clusters_to_del);
6882                 if (status < 0) {
6883                         mlog_errno(status);
6884                         goto bail;
6885                 }
6886         }
6887         status = 0;
6888 bail:
6889         brelse(last_eb_bh);
6890         mlog_exit(status);
6891         return status;
6892 }
6893
6894 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6895 {
6896         set_buffer_uptodate(bh);
6897         mark_buffer_dirty(bh);
6898         return 0;
6899 }
6900
6901 void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6902                               unsigned int from, unsigned int to,
6903                               struct page *page, int zero, u64 *phys)
6904 {
6905         int ret, partial = 0;
6906
6907         ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6908         if (ret)
6909                 mlog_errno(ret);
6910
6911         if (zero)
6912                 zero_user_segment(page, from, to);
6913
6914         /*
6915          * Need to set the buffers we zero'd into uptodate
6916          * here if they aren't - ocfs2_map_page_blocks()
6917          * might've skipped some
6918          */
6919         ret = walk_page_buffers(handle, page_buffers(page),
6920                                 from, to, &partial,
6921                                 ocfs2_zero_func);
6922         if (ret < 0)
6923                 mlog_errno(ret);
6924         else if (ocfs2_should_order_data(inode)) {
6925                 ret = ocfs2_jbd2_file_inode(handle, inode);
6926                 if (ret < 0)
6927                         mlog_errno(ret);
6928         }
6929
6930         if (!partial)
6931                 SetPageUptodate(page);
6932
6933         flush_dcache_page(page);
6934 }
6935
6936 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6937                                      loff_t end, struct page **pages,
6938                                      int numpages, u64 phys, handle_t *handle)
6939 {
6940         int i;
6941         struct page *page;
6942         unsigned int from, to = PAGE_CACHE_SIZE;
6943         struct super_block *sb = inode->i_sb;
6944
6945         BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6946
6947         if (numpages == 0)
6948                 goto out;
6949
6950         to = PAGE_CACHE_SIZE;
6951         for(i = 0; i < numpages; i++) {
6952                 page = pages[i];
6953
6954                 from = start & (PAGE_CACHE_SIZE - 1);
6955                 if ((end >> PAGE_CACHE_SHIFT) == page->index)
6956                         to = end & (PAGE_CACHE_SIZE - 1);
6957
6958                 BUG_ON(from > PAGE_CACHE_SIZE);
6959                 BUG_ON(to > PAGE_CACHE_SIZE);
6960
6961                 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6962                                          &phys);
6963
6964                 start = (page->index + 1) << PAGE_CACHE_SHIFT;
6965         }
6966 out:
6967         if (pages)
6968                 ocfs2_unlock_and_free_pages(pages, numpages);
6969 }
6970
6971 int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
6972                      struct page **pages, int *num)
6973 {
6974         int numpages, ret = 0;
6975         struct address_space *mapping = inode->i_mapping;
6976         unsigned long index;
6977         loff_t last_page_bytes;
6978
6979         BUG_ON(start > end);
6980
6981         numpages = 0;
6982         last_page_bytes = PAGE_ALIGN(end);
6983         index = start >> PAGE_CACHE_SHIFT;
6984         do {
6985                 pages[numpages] = grab_cache_page(mapping, index);
6986                 if (!pages[numpages]) {
6987                         ret = -ENOMEM;
6988                         mlog_errno(ret);
6989                         goto out;
6990                 }
6991
6992                 numpages++;
6993                 index++;
6994         } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6995
6996 out:
6997         if (ret != 0) {
6998                 if (pages)
6999                         ocfs2_unlock_and_free_pages(pages, numpages);
7000                 numpages = 0;
7001         }
7002
7003         *num = numpages;
7004
7005         return ret;
7006 }
7007
7008 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
7009                                 struct page **pages, int *num)
7010 {
7011         struct super_block *sb = inode->i_sb;
7012
7013         BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
7014                (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
7015
7016         return ocfs2_grab_pages(inode, start, end, pages, num);
7017 }
7018
7019 /*
7020  * Zero the area past i_size but still within an allocated
7021  * cluster. This avoids exposing nonzero data on subsequent file
7022  * extends.
7023  *
7024  * We need to call this before i_size is updated on the inode because
7025  * otherwise block_write_full_page() will skip writeout of pages past
7026  * i_size. The new_i_size parameter is passed for this reason.
7027  */
7028 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
7029                                   u64 range_start, u64 range_end)
7030 {
7031         int ret = 0, numpages;
7032         struct page **pages = NULL;
7033         u64 phys;
7034         unsigned int ext_flags;
7035         struct super_block *sb = inode->i_sb;
7036
7037         /*
7038          * File systems which don't support sparse files zero on every
7039          * extend.
7040          */
7041         if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
7042                 return 0;
7043
7044         pages = kcalloc(ocfs2_pages_per_cluster(sb),
7045                         sizeof(struct page *), GFP_NOFS);
7046         if (pages == NULL) {
7047                 ret = -ENOMEM;
7048                 mlog_errno(ret);
7049                 goto out;
7050         }
7051
7052         if (range_start == range_end)
7053                 goto out;
7054
7055         ret = ocfs2_extent_map_get_blocks(inode,
7056                                           range_start >> sb->s_blocksize_bits,
7057                                           &phys, NULL, &ext_flags);
7058         if (ret) {
7059                 mlog_errno(ret);
7060                 goto out;
7061         }
7062
7063         /*
7064          * Tail is a hole, or is marked unwritten. In either case, we
7065          * can count on read and write to return/push zero's.
7066          */
7067         if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
7068                 goto out;
7069
7070         ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
7071                                    &numpages);
7072         if (ret) {
7073                 mlog_errno(ret);
7074                 goto out;
7075         }
7076
7077         ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
7078                                  numpages, phys, handle);
7079
7080         /*
7081          * Initiate writeout of the pages we zero'd here. We don't
7082          * wait on them - the truncate_inode_pages() call later will
7083          * do that for us.
7084          */
7085         ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
7086                                        range_end - 1);
7087         if (ret)
7088                 mlog_errno(ret);
7089
7090 out:
7091         if (pages)
7092                 kfree(pages);
7093
7094         return ret;
7095 }
7096
7097 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
7098                                              struct ocfs2_dinode *di)
7099 {
7100         unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
7101         unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
7102
7103         if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
7104                 memset(&di->id2, 0, blocksize -
7105                                     offsetof(struct ocfs2_dinode, id2) -
7106                                     xattrsize);
7107         else
7108                 memset(&di->id2, 0, blocksize -
7109                                     offsetof(struct ocfs2_dinode, id2));
7110 }
7111
7112 void ocfs2_dinode_new_extent_list(struct inode *inode,
7113                                   struct ocfs2_dinode *di)
7114 {
7115         ocfs2_zero_dinode_id2_with_xattr(inode, di);
7116         di->id2.i_list.l_tree_depth = 0;
7117         di->id2.i_list.l_next_free_rec = 0;
7118         di->id2.i_list.l_count = cpu_to_le16(
7119                 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
7120 }
7121
7122 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
7123 {
7124         struct ocfs2_inode_info *oi = OCFS2_I(inode);
7125         struct ocfs2_inline_data *idata = &di->id2.i_data;
7126
7127         spin_lock(&oi->ip_lock);
7128         oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
7129         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7130         spin_unlock(&oi->ip_lock);
7131
7132         /*
7133          * We clear the entire i_data structure here so that all
7134          * fields can be properly initialized.
7135          */
7136         ocfs2_zero_dinode_id2_with_xattr(inode, di);
7137
7138         idata->id_count = cpu_to_le16(
7139                         ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
7140 }
7141
7142 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7143                                          struct buffer_head *di_bh)
7144 {
7145         int ret, i, has_data, num_pages = 0;
7146         handle_t *handle;
7147         u64 uninitialized_var(block);
7148         struct ocfs2_inode_info *oi = OCFS2_I(inode);
7149         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7150         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7151         struct ocfs2_alloc_context *data_ac = NULL;
7152         struct page **pages = NULL;
7153         loff_t end = osb->s_clustersize;
7154         struct ocfs2_extent_tree et;
7155         int did_quota = 0;
7156
7157         has_data = i_size_read(inode) ? 1 : 0;
7158
7159         if (has_data) {
7160                 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
7161                                 sizeof(struct page *), GFP_NOFS);
7162                 if (pages == NULL) {
7163                         ret = -ENOMEM;
7164                         mlog_errno(ret);
7165                         goto out;
7166                 }
7167
7168                 ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
7169                 if (ret) {
7170                         mlog_errno(ret);
7171                         goto out;
7172                 }
7173         }
7174
7175         handle = ocfs2_start_trans(osb,
7176                                    ocfs2_inline_to_extents_credits(osb->sb));
7177         if (IS_ERR(handle)) {
7178                 ret = PTR_ERR(handle);
7179                 mlog_errno(ret);
7180                 goto out_unlock;
7181         }
7182
7183         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7184                                       OCFS2_JOURNAL_ACCESS_WRITE);
7185         if (ret) {
7186                 mlog_errno(ret);
7187                 goto out_commit;
7188         }
7189
7190         if (has_data) {
7191                 u32 bit_off, num;
7192                 unsigned int page_end;
7193                 u64 phys;
7194
7195                 ret = dquot_alloc_space_nodirty(inode,
7196                                        ocfs2_clusters_to_bytes(osb->sb, 1));
7197                 if (ret)
7198                         goto out_commit;
7199                 did_quota = 1;
7200
7201                 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
7202
7203                 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7204                                            &num);
7205                 if (ret) {
7206                         mlog_errno(ret);
7207                         goto out_commit;
7208                 }
7209
7210                 /*
7211                  * Save two copies, one for insert, and one that can
7212                  * be changed by ocfs2_map_and_dirty_page() below.
7213                  */
7214                 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
7215
7216                 /*
7217                  * Non sparse file systems zero on extend, so no need
7218                  * to do that now.
7219                  */
7220                 if (!ocfs2_sparse_alloc(osb) &&
7221                     PAGE_CACHE_SIZE < osb->s_clustersize)
7222                         end = PAGE_CACHE_SIZE;
7223
7224                 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
7225                 if (ret) {
7226                         mlog_errno(ret);
7227                         goto out_commit;
7228                 }
7229
7230                 /*
7231                  * This should populate the 1st page for us and mark
7232                  * it up to date.
7233                  */
7234                 ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
7235                 if (ret) {
7236                         mlog_errno(ret);
7237                         goto out_commit;
7238                 }
7239
7240                 page_end = PAGE_CACHE_SIZE;
7241                 if (PAGE_CACHE_SIZE > osb->s_clustersize)
7242                         page_end = osb->s_clustersize;
7243
7244                 for (i = 0; i < num_pages; i++)
7245                         ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
7246                                                  pages[i], i > 0, &phys);
7247         }
7248
7249         spin_lock(&oi->ip_lock);
7250         oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
7251         di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7252         spin_unlock(&oi->ip_lock);
7253
7254         ocfs2_dinode_new_extent_list(inode, di);
7255
7256         ocfs2_journal_dirty(handle, di_bh);
7257
7258         if (has_data) {
7259                 /*
7260                  * An error at this point should be extremely rare. If
7261                  * this proves to be false, we could always re-build
7262                  * the in-inode data from our pages.
7263                  */
7264                 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7265                 ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
7266                 if (ret) {
7267                         mlog_errno(ret);
7268                         goto out_commit;
7269                 }
7270
7271                 inode->i_blocks = ocfs2_inode_sector_count(inode);
7272         }
7273
7274 out_commit:
7275         if (ret < 0 && did_quota)
7276                 dquot_free_space_nodirty(inode,
7277                                           ocfs2_clusters_to_bytes(osb->sb, 1));
7278
7279         ocfs2_commit_trans(osb, handle);
7280
7281 out_unlock:
7282         if (data_ac)
7283                 ocfs2_free_alloc_context(data_ac);
7284
7285 out:
7286         if (pages) {
7287                 ocfs2_unlock_and_free_pages(pages, num_pages);
7288                 kfree(pages);
7289         }
7290
7291         return ret;
7292 }
7293
7294 /*
7295  * It is expected, that by the time you call this function,
7296  * inode->i_size and fe->i_size have been adjusted.
7297  *
7298  * WARNING: This will kfree the truncate context
7299  */
7300 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7301                           struct inode *inode,
7302                           struct buffer_head *fe_bh,
7303                           struct ocfs2_truncate_context *tc)
7304 {
7305         int status, i, credits, tl_sem = 0;
7306         u32 clusters_to_del, new_highest_cpos, range;
7307         u64 blkno = 0;
7308         struct ocfs2_extent_list *el;
7309         handle_t *handle = NULL;
7310         struct inode *tl_inode = osb->osb_tl_inode;
7311         struct ocfs2_path *path = NULL;
7312         struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7313         struct ocfs2_alloc_context *meta_ac = NULL;
7314         struct ocfs2_refcount_tree *ref_tree = NULL;
7315
7316         mlog_entry_void();
7317
7318         new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7319                                                      i_size_read(inode));
7320
7321         path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7322                               ocfs2_journal_access_di);
7323         if (!path) {
7324                 status = -ENOMEM;
7325                 mlog_errno(status);
7326                 goto bail;
7327         }
7328
7329         ocfs2_extent_map_trunc(inode, new_highest_cpos);
7330
7331 start:
7332         /*
7333          * Check that we still have allocation to delete.
7334          */
7335         if (OCFS2_I(inode)->ip_clusters == 0) {
7336                 status = 0;
7337                 goto bail;
7338         }
7339
7340         credits = 0;
7341
7342         /*
7343          * Truncate always works against the rightmost tree branch.
7344          */
7345         status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7346         if (status) {
7347                 mlog_errno(status);
7348                 goto bail;
7349         }
7350
7351         mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7352              OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7353
7354         /*
7355          * By now, el will point to the extent list on the bottom most
7356          * portion of this tree. Only the tail record is considered in
7357          * each pass.
7358          *
7359          * We handle the following cases, in order:
7360          * - empty extent: delete the remaining branch
7361          * - remove the entire record
7362          * - remove a partial record
7363          * - no record needs to be removed (truncate has completed)
7364          */
7365         el = path_leaf_el(path);
7366         if (le16_to_cpu(el->l_next_free_rec) == 0) {
7367                 ocfs2_error(inode->i_sb,
7368                             "Inode %llu has empty extent block at %llu\n",
7369                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7370                             (unsigned long long)path_leaf_bh(path)->b_blocknr);
7371                 status = -EROFS;
7372                 goto bail;
7373         }
7374
7375         i = le16_to_cpu(el->l_next_free_rec) - 1;
7376         range = le32_to_cpu(el->l_recs[i].e_cpos) +
7377                 ocfs2_rec_clusters(el, &el->l_recs[i]);
7378         if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7379                 clusters_to_del = 0;
7380         } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7381                 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7382                 blkno = le64_to_cpu(el->l_recs[i].e_blkno);
7383         } else if (range > new_highest_cpos) {
7384                 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7385                                    le32_to_cpu(el->l_recs[i].e_cpos)) -
7386                                   new_highest_cpos;
7387                 blkno = le64_to_cpu(el->l_recs[i].e_blkno) +
7388                         ocfs2_clusters_to_blocks(inode->i_sb,
7389                                 ocfs2_rec_clusters(el, &el->l_recs[i]) -
7390                                 clusters_to_del);
7391         } else {
7392                 status = 0;
7393                 goto bail;
7394         }
7395
7396         mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7397              clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7398
7399         if (el->l_recs[i].e_flags & OCFS2_EXT_REFCOUNTED && clusters_to_del) {
7400                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
7401                          OCFS2_HAS_REFCOUNT_FL));
7402
7403                 status = ocfs2_lock_refcount_tree(osb,
7404                                                 le64_to_cpu(di->i_refcount_loc),
7405                                                 1, &ref_tree, NULL);
7406                 if (status) {
7407                         mlog_errno(status);
7408                         goto bail;
7409                 }
7410
7411                 status = ocfs2_prepare_refcount_change_for_del(inode, fe_bh,
7412                                                                blkno,
7413                                                                clusters_to_del,
7414                                                                &credits,
7415                                                                &meta_ac);
7416                 if (status < 0) {
7417                         mlog_errno(status);
7418                         goto bail;
7419                 }
7420         }
7421
7422         mutex_lock(&tl_inode->i_mutex);
7423         tl_sem = 1;
7424         /* ocfs2_truncate_log_needs_flush guarantees us at least one
7425          * record is free for use. If there isn't any, we flush to get
7426          * an empty truncate log.  */
7427         if (ocfs2_truncate_log_needs_flush(osb)) {
7428                 status = __ocfs2_flush_truncate_log(osb);
7429                 if (status < 0) {
7430                         mlog_errno(status);
7431                         goto bail;
7432                 }
7433         }
7434
7435         credits += ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7436                                                 (struct ocfs2_dinode *)fe_bh->b_data,
7437                                                 el);
7438         handle = ocfs2_start_trans(osb, credits);
7439         if (IS_ERR(handle)) {
7440                 status = PTR_ERR(handle);
7441                 handle = NULL;
7442                 mlog_errno(status);
7443                 goto bail;
7444         }
7445
7446         status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7447                                    tc, path, meta_ac);
7448         if (status < 0) {
7449                 mlog_errno(status);
7450                 goto bail;
7451         }
7452
7453         mutex_unlock(&tl_inode->i_mutex);
7454         tl_sem = 0;
7455
7456         ocfs2_commit_trans(osb, handle);
7457         handle = NULL;
7458
7459         ocfs2_reinit_path(path, 1);
7460
7461         if (meta_ac) {
7462                 ocfs2_free_alloc_context(meta_ac);
7463                 meta_ac = NULL;
7464         }
7465
7466         if (ref_tree) {
7467                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7468                 ref_tree = NULL;
7469         }
7470
7471         /*
7472          * The check above will catch the case where we've truncated
7473          * away all allocation.
7474          */
7475         goto start;
7476
7477 bail:
7478
7479         ocfs2_schedule_truncate_log_flush(osb, 1);
7480
7481         if (tl_sem)
7482                 mutex_unlock(&tl_inode->i_mutex);
7483
7484         if (handle)
7485                 ocfs2_commit_trans(osb, handle);
7486
7487         if (meta_ac)
7488                 ocfs2_free_alloc_context(meta_ac);
7489
7490         if (ref_tree)
7491                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
7492
7493         ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7494
7495         ocfs2_free_path(path);
7496
7497         /* This will drop the ext_alloc cluster lock for us */
7498         ocfs2_free_truncate_context(tc);
7499
7500         mlog_exit(status);
7501         return status;
7502 }
7503
7504 /*
7505  * Expects the inode to already be locked.
7506  */
7507 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7508                            struct inode *inode,
7509                            struct buffer_head *fe_bh,
7510                            struct ocfs2_truncate_context **tc)
7511 {
7512         int status;
7513         unsigned int new_i_clusters;
7514         struct ocfs2_dinode *fe;
7515         struct ocfs2_extent_block *eb;
7516         struct buffer_head *last_eb_bh = NULL;
7517
7518         mlog_entry_void();
7519
7520         *tc = NULL;
7521
7522         new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7523                                                   i_size_read(inode));
7524         fe = (struct ocfs2_dinode *) fe_bh->b_data;
7525
7526         mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7527              "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7528              (unsigned long long)le64_to_cpu(fe->i_size));
7529
7530         *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7531         if (!(*tc)) {
7532                 status = -ENOMEM;
7533                 mlog_errno(status);
7534                 goto bail;
7535         }
7536         ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7537
7538         if (fe->id2.i_list.l_tree_depth) {
7539                 status = ocfs2_read_extent_block(INODE_CACHE(inode),
7540                                                  le64_to_cpu(fe->i_last_eb_blk),
7541                                                  &last_eb_bh);
7542                 if (status < 0) {
7543                         mlog_errno(status);
7544                         goto bail;
7545                 }
7546                 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7547         }
7548
7549         (*tc)->tc_last_eb_bh = last_eb_bh;
7550
7551         status = 0;
7552 bail:
7553         if (status < 0) {
7554                 if (*tc)
7555                         ocfs2_free_truncate_context(*tc);
7556                 *tc = NULL;
7557         }
7558         mlog_exit_void();
7559         return status;
7560 }
7561
7562 /*
7563  * 'start' is inclusive, 'end' is not.
7564  */
7565 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7566                           unsigned int start, unsigned int end, int trunc)
7567 {
7568         int ret;
7569         unsigned int numbytes;
7570         handle_t *handle;
7571         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7572         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7573         struct ocfs2_inline_data *idata = &di->id2.i_data;
7574
7575         if (end > i_size_read(inode))
7576                 end = i_size_read(inode);
7577
7578         BUG_ON(start >= end);
7579
7580         if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7581             !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7582             !ocfs2_supports_inline_data(osb)) {
7583                 ocfs2_error(inode->i_sb,
7584                             "Inline data flags for inode %llu don't agree! "
7585                             "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7586                             (unsigned long long)OCFS2_I(inode)->ip_blkno,
7587                             le16_to_cpu(di->i_dyn_features),
7588                             OCFS2_I(inode)->ip_dyn_features,
7589                             osb->s_feature_incompat);
7590                 ret = -EROFS;
7591                 goto out;
7592         }
7593
7594         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7595         if (IS_ERR(handle)) {
7596                 ret = PTR_ERR(handle);
7597                 mlog_errno(ret);
7598                 goto out;
7599         }
7600
7601         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7602                                       OCFS2_JOURNAL_ACCESS_WRITE);
7603         if (ret) {
7604                 mlog_errno(ret);
7605                 goto out_commit;
7606         }
7607
7608         numbytes = end - start;
7609         memset(idata->id_data + start, 0, numbytes);
7610
7611         /*
7612          * No need to worry about the data page here - it's been
7613          * truncated already and inline data doesn't need it for
7614          * pushing zero's to disk, so we'll let readpage pick it up
7615          * later.
7616          */
7617         if (trunc) {
7618                 i_size_write(inode, start);
7619                 di->i_size = cpu_to_le64(start);
7620         }
7621
7622         inode->i_blocks = ocfs2_inode_sector_count(inode);
7623         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7624
7625         di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7626         di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7627
7628         ocfs2_journal_dirty(handle, di_bh);
7629
7630 out_commit:
7631         ocfs2_commit_trans(osb, handle);
7632
7633 out:
7634         return ret;
7635 }
7636
7637 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7638 {
7639         /*
7640          * The caller is responsible for completing deallocation
7641          * before freeing the context.
7642          */
7643         if (tc->tc_dealloc.c_first_suballocator != NULL)
7644                 mlog(ML_NOTICE,
7645                      "Truncate completion has non-empty dealloc context\n");
7646
7647         brelse(tc->tc_last_eb_bh);
7648
7649         kfree(tc);
7650 }