Merge branch 'master' of git://oss.sgi.com:8090/xfs/linux-2.6
authorNiv Sardi <xaiki@debian.org>
Fri, 7 Nov 2008 04:07:12 +0000 (15:07 +1100)
committerNiv Sardi <xaiki@debian.org>
Fri, 7 Nov 2008 04:07:12 +0000 (15:07 +1100)
84 files changed:
Documentation/filesystems/xfs.txt
fs/inode.c
fs/xfs/Makefile
fs/xfs/linux-2.6/xfs_aops.c
fs/xfs/linux-2.6/xfs_cred.h
fs/xfs/linux-2.6/xfs_globals.c
fs/xfs/linux-2.6/xfs_globals.h
fs/xfs/linux-2.6/xfs_ioctl.c
fs/xfs/linux-2.6/xfs_iops.c
fs/xfs/linux-2.6/xfs_linux.h
fs/xfs/linux-2.6/xfs_stats.c
fs/xfs/linux-2.6/xfs_stats.h
fs/xfs/linux-2.6/xfs_super.c
fs/xfs/linux-2.6/xfs_super.h
fs/xfs/linux-2.6/xfs_sync.c [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_sync.h [new file with mode: 0644]
fs/xfs/linux-2.6/xfs_sysctl.c
fs/xfs/linux-2.6/xfs_sysctl.h
fs/xfs/linux-2.6/xfs_vfs.h
fs/xfs/linux-2.6/xfs_vnode.c
fs/xfs/linux-2.6/xfs_vnode.h
fs/xfs/quota/xfs_dquot.c
fs/xfs/quota/xfs_dquot.h
fs/xfs/quota/xfs_dquot_item.c
fs/xfs/quota/xfs_qm.c
fs/xfs/quota/xfs_qm.h
fs/xfs/quota/xfs_qm_bhv.c
fs/xfs/quota/xfs_qm_syscalls.c
fs/xfs/support/debug.c
fs/xfs/xfs.h
fs/xfs/xfs_acl.c
fs/xfs/xfs_ag.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_alloc.h
fs/xfs/xfs_alloc_btree.c
fs/xfs/xfs_alloc_btree.h
fs/xfs/xfs_arch.h
fs/xfs/xfs_bit.h
fs/xfs/xfs_bmap.c
fs/xfs/xfs_bmap.h
fs/xfs/xfs_bmap_btree.c
fs/xfs/xfs_bmap_btree.h
fs/xfs/xfs_btree.c
fs/xfs/xfs_btree.h
fs/xfs/xfs_btree_trace.c [new file with mode: 0644]
fs/xfs/xfs_btree_trace.h [new file with mode: 0644]
fs/xfs/xfs_buf_item.c
fs/xfs/xfs_clnt.h [deleted file]
fs/xfs/xfs_da_btree.c
fs/xfs/xfs_da_btree.h
fs/xfs/xfs_dinode.h
fs/xfs/xfs_dir2.c
fs/xfs/xfs_dmops.c
fs/xfs/xfs_extfree_item.c
fs/xfs/xfs_fsops.c
fs/xfs/xfs_ialloc.c
fs/xfs/xfs_ialloc.h
fs/xfs/xfs_ialloc_btree.c
fs/xfs/xfs_ialloc_btree.h
fs/xfs/xfs_iget.c
fs/xfs/xfs_imap.h
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_inode_item.c
fs/xfs/xfs_inode_item.h
fs/xfs/xfs_itable.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_qmops.c
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_trans_item.c
fs/xfs/xfs_trans_priv.h
fs/xfs/xfs_vfsops.c
fs/xfs/xfs_vfsops.h
fs/xfs/xfs_vnodeops.c
fs/xfs/xfs_vnodeops.h
include/linux/fs.h
kernel/sysctl_check.c

index 0a1668ba26008d56a5e888a1a876f072a5800966..9878f50d6ed6d37b3241eb558a58d08681898927 100644 (file)
@@ -229,10 +229,6 @@ The following sysctls are available for the XFS filesystem:
        ISGID bit is cleared if the irix_sgid_inherit compatibility sysctl
        is set.
 
-  fs.xfs.restrict_chown                (Min: 0  Default: 1  Max: 1)
-       Controls whether unprivileged users can use chown to "give away"
-       a file to another user.
-
   fs.xfs.inherit_sync          (Min: 0  Default: 1  Max: 1)
        Setting this to "1" will cause the "sync" flag set
        by the xfs_io(8) chattr command on a directory to be
index 0487ddba139780a4e0bfabc8f4ef2103358d9713..f84ba338fafd81264847f797533f14230c9c7214 100644 (file)
@@ -108,84 +108,100 @@ static void wake_up_inode(struct inode *inode)
        wake_up_bit(&inode->i_state, __I_LOCK);
 }
 
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * inode_init_always - perform inode structure intialisation
+ * @sb         - superblock inode belongs to.
+ * @inode      - inode to initialise
+ *
+ * These are initializations that need to be done on every inode
+ * allocation as the fields are not initialised by slab allocation.
+ */
+struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 {
        static const struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static const struct file_operations empty_fops;
-       struct inode *inode;
-
-       if (sb->s_op->alloc_inode)
-               inode = sb->s_op->alloc_inode(sb);
-       else
-               inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
 
-       if (inode) {
-               struct address_space * const mapping = &inode->i_data;
-
-               inode->i_sb = sb;
-               inode->i_blkbits = sb->s_blocksize_bits;
-               inode->i_flags = 0;
-               atomic_set(&inode->i_count, 1);
-               inode->i_op = &empty_iops;
-               inode->i_fop = &empty_fops;
-               inode->i_nlink = 1;
-               atomic_set(&inode->i_writecount, 0);
-               inode->i_size = 0;
-               inode->i_blocks = 0;
-               inode->i_bytes = 0;
-               inode->i_generation = 0;
+       struct address_space * const mapping = &inode->i_data;
+
+       inode->i_sb = sb;
+       inode->i_blkbits = sb->s_blocksize_bits;
+       inode->i_flags = 0;
+       atomic_set(&inode->i_count, 1);
+       inode->i_op = &empty_iops;
+       inode->i_fop = &empty_fops;
+       inode->i_nlink = 1;
+       atomic_set(&inode->i_writecount, 0);
+       inode->i_size = 0;
+       inode->i_blocks = 0;
+       inode->i_bytes = 0;
+       inode->i_generation = 0;
 #ifdef CONFIG_QUOTA
-               memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+       memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
 #endif
-               inode->i_pipe = NULL;
-               inode->i_bdev = NULL;
-               inode->i_cdev = NULL;
-               inode->i_rdev = 0;
-               inode->dirtied_when = 0;
-               if (security_inode_alloc(inode)) {
-                       if (inode->i_sb->s_op->destroy_inode)
-                               inode->i_sb->s_op->destroy_inode(inode);
-                       else
-                               kmem_cache_free(inode_cachep, (inode));
-                       return NULL;
-               }
+       inode->i_pipe = NULL;
+       inode->i_bdev = NULL;
+       inode->i_cdev = NULL;
+       inode->i_rdev = 0;
+       inode->dirtied_when = 0;
+       if (security_inode_alloc(inode)) {
+               if (inode->i_sb->s_op->destroy_inode)
+                       inode->i_sb->s_op->destroy_inode(inode);
+               else
+                       kmem_cache_free(inode_cachep, (inode));
+               return NULL;
+       }
 
-               spin_lock_init(&inode->i_lock);
-               lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
+       spin_lock_init(&inode->i_lock);
+       lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
-               mutex_init(&inode->i_mutex);
-               lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+       mutex_init(&inode->i_mutex);
+       lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
 
-               init_rwsem(&inode->i_alloc_sem);
-               lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+       init_rwsem(&inode->i_alloc_sem);
+       lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
 
-               mapping->a_ops = &empty_aops;
-               mapping->host = inode;
-               mapping->flags = 0;
-               mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
-               mapping->assoc_mapping = NULL;
-               mapping->backing_dev_info = &default_backing_dev_info;
-               mapping->writeback_index = 0;
+       mapping->a_ops = &empty_aops;
+       mapping->host = inode;
+       mapping->flags = 0;
+       mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+       mapping->assoc_mapping = NULL;
+       mapping->backing_dev_info = &default_backing_dev_info;
+       mapping->writeback_index = 0;
 
-               /*
-                * If the block_device provides a backing_dev_info for client
-                * inodes then use that.  Otherwise the inode share the bdev's
-                * backing_dev_info.
-                */
-               if (sb->s_bdev) {
-                       struct backing_dev_info *bdi;
+       /*
+        * If the block_device provides a backing_dev_info for client
+        * inodes then use that.  Otherwise the inode share the bdev's
+        * backing_dev_info.
+        */
+       if (sb->s_bdev) {
+               struct backing_dev_info *bdi;
 
-                       bdi = sb->s_bdev->bd_inode_backing_dev_info;
-                       if (!bdi)
-                               bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
-                       mapping->backing_dev_info = bdi;
-               }
-               inode->i_private = NULL;
-               inode->i_mapping = mapping;
+               bdi = sb->s_bdev->bd_inode_backing_dev_info;
+               if (!bdi)
+                       bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+               mapping->backing_dev_info = bdi;
        }
+       inode->i_private = NULL;
+       inode->i_mapping = mapping;
+
        return inode;
 }
+EXPORT_SYMBOL(inode_init_always);
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+       struct inode *inode;
+
+       if (sb->s_op->alloc_inode)
+               inode = sb->s_op->alloc_inode(sb);
+       else
+               inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+
+       if (inode)
+               return inode_init_always(sb, inode);
+       return NULL;
+}
 
 void destroy_inode(struct inode *inode) 
 {
@@ -196,6 +212,7 @@ void destroy_inode(struct inode *inode)
        else
                kmem_cache_free(inode_cachep, (inode));
 }
+EXPORT_SYMBOL(destroy_inode);
 
 
 /*
@@ -534,6 +551,49 @@ repeat:
        return node ? inode : NULL;
 }
 
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+       unsigned long tmp;
+
+       tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+                       L1_CACHE_BYTES;
+       tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+       return tmp & I_HASHMASK;
+}
+
+static inline void
+__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
+                       struct inode *inode)
+{
+       inodes_stat.nr_inodes++;
+       list_add(&inode->i_list, &inode_in_use);
+       list_add(&inode->i_sb_list, &sb->s_inodes);
+       if (head)
+               hlist_add_head(&inode->i_hash, head);
+}
+
+/**
+ * inode_add_to_lists - add a new inode to relevant lists
+ * @sb         - superblock inode belongs to.
+ * @inode      - inode to mark in use
+ *
+ * When an inode is allocated it needs to be accounted for, added to the in use
+ * list, the owning superblock and the inode hash. This needs to be done under
+ * the inode_lock, so export a function to do this rather than the inode lock
+ * itself. We calculate the hash list to add to here so it is all internal
+ * which requires the caller to have already set up the inode number in the
+ * inode to add.
+ */
+void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+{
+       struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+
+       spin_lock(&inode_lock);
+       __inode_add_to_lists(sb, head, inode);
+       spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_add_to_lists);
+
 /**
  *     new_inode       - obtain an inode
  *     @sb: superblock
@@ -561,9 +621,7 @@ struct inode *new_inode(struct super_block *sb)
        inode = alloc_inode(sb);
        if (inode) {
                spin_lock(&inode_lock);
-               inodes_stat.nr_inodes++;
-               list_add(&inode->i_list, &inode_in_use);
-               list_add(&inode->i_sb_list, &sb->s_inodes);
+               __inode_add_to_lists(sb, NULL, inode);
                inode->i_ino = ++last_ino;
                inode->i_state = 0;
                spin_unlock(&inode_lock);
@@ -622,10 +680,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
                        if (set(inode, data))
                                goto set_failed;
 
-                       inodes_stat.nr_inodes++;
-                       list_add(&inode->i_list, &inode_in_use);
-                       list_add(&inode->i_sb_list, &sb->s_inodes);
-                       hlist_add_head(&inode->i_hash, head);
+                       __inode_add_to_lists(sb, head, inode);
                        inode->i_state = I_LOCK|I_NEW;
                        spin_unlock(&inode_lock);
 
@@ -671,10 +726,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
                old = find_inode_fast(sb, head, ino);
                if (!old) {
                        inode->i_ino = ino;
-                       inodes_stat.nr_inodes++;
-                       list_add(&inode->i_list, &inode_in_use);
-                       list_add(&inode->i_sb_list, &sb->s_inodes);
-                       hlist_add_head(&inode->i_hash, head);
+                       __inode_add_to_lists(sb, head, inode);
                        inode->i_state = I_LOCK|I_NEW;
                        spin_unlock(&inode_lock);
 
@@ -698,16 +750,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
        return inode;
 }
 
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
-       unsigned long tmp;
-
-       tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
-                       L1_CACHE_BYTES;
-       tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
-       return tmp & I_HASHMASK;
-}
-
 /**
  *     iunique - get a unique inode number
  *     @sb: superblock
index 737c9a425361df67f3f7ef9d0ff674e26caf6105..51b87de97f875c1d1f29e55e72f7b619f6ae1a68 100644 (file)
@@ -91,7 +91,8 @@ xfs-y                         += xfs_alloc.o \
                                   xfs_dmops.o \
                                   xfs_qmops.o
 
-xfs-$(CONFIG_XFS_TRACE)                += xfs_dir2_trace.o
+xfs-$(CONFIG_XFS_TRACE)                += xfs_btree_trace.o \
+                                  xfs_dir2_trace.o
 
 # Objects in linux/
 xfs-y                          += $(addprefix $(XFS_LINUX)/, \
@@ -106,6 +107,7 @@ xfs-y                               += $(addprefix $(XFS_LINUX)/, \
                                   xfs_iops.o \
                                   xfs_lrw.o \
                                   xfs_super.o \
+                                  xfs_sync.o \
                                   xfs_vnode.o \
                                   xfs_xattr.o)
 
index a44d68eb50b5302b8482000e523661b14c0854d8..8fbc97df3609aea427abda775161f40c9b8f0f57 100644 (file)
@@ -191,7 +191,7 @@ xfs_setfilesize(
                ip->i_d.di_size = isize;
                ip->i_update_core = 1;
                ip->i_update_size = 1;
-               mark_inode_dirty_sync(ioend->io_inode);
+               xfs_mark_inode_dirty_sync(ip);
        }
 
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
index 652721ce0ea5ecd0bb19d53c519facf9a62026a6..e279d00779f4dfb07defbd51f77216790d0daea9 100644 (file)
  * Credentials
  */
 typedef struct cred {
-       /* EMPTY */
+       /* EMPTY */
 } cred_t;
 
-extern struct cred *sys_cred;
-
-/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static inline int capable_cred(cred_t *cr, int cid)
-{
-       return (cr == sys_cred) ? 1 : capable(cid);
-}
-
 #endif  /* __XFS_CRED_H__ */
index ef90e64641e6a662b0478d74fc0918246c9a2e95..2ae8b1ccb02e1f86ad1f8219985130eaf132dc81 100644 (file)
@@ -26,7 +26,6 @@
  */
 xfs_param_t xfs_params = {
                          /*    MIN             DFLT            MAX     */
-       .restrict_chown = {     0,              1,              1       },
        .sgid_inherit   = {     0,              0,              1       },
        .symlink_mode   = {     0,              0,              1       },
        .panic_mask     = {     0,              0,              255     },
@@ -43,10 +42,3 @@ xfs_param_t xfs_params = {
        .inherit_nodfrg = {     0,              1,              1       },
        .fstrm_timer    = {     1,              30*100,         3600*100},
 };
-
-/*
- * Global system credential structure.
- */
-static cred_t sys_cred_val;
-cred_t *sys_cred = &sys_cred_val;
-
index 2770b0085ee869bedac0def84d8cdf3e37964c6f..69f71caf061cf037075f015af0425af294343409 100644 (file)
@@ -19,6 +19,5 @@
 #define __XFS_GLOBALS_H__
 
 extern uint64_t        xfs_panic_mask;         /* set to cause more panics */
-extern struct cred *sys_cred;
 
 #endif /* __XFS_GLOBALS_H__ */
index d3438c72dcaf555b8d18174a00d1b6eab264615f..f1bd6c36e6fe57eafe9259130a71722e38fd6c9e 100644 (file)
@@ -691,8 +691,7 @@ xfs_ioc_space(
        if (ioflags & IO_INVIS)
                attr_flags |= XFS_ATTR_DMI;
 
-       error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos,
-                                             NULL, attr_flags);
+       error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos, attr_flags);
        return -error;
 }
 
@@ -1007,7 +1006,7 @@ xfs_ioctl_setattr(
         * to the file owner ID, except in cases where the
         * CAP_FSETID capability is applicable.
         */
-       if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+       if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
                code = XFS_ERROR(EPERM);
                goto error_return;
        }
@@ -1104,10 +1103,6 @@ xfs_ioctl_setattr(
 
        /*
         * Change file ownership.  Must be the owner or privileged.
-        * If the system was configured with the "restricted_chown"
-        * option, the owner is not permitted to give away the file,
-        * and can change the group id only to a group of which he
-        * or she is a member.
         */
        if (mask & FSX_PROJID) {
                /*
index 095d271f34349e01175eecd3e4c362dbcc1dccce..f78bc221576411d36ba2c0878a17487cb2c97039 100644 (file)
@@ -64,14 +64,14 @@ xfs_synchronize_atime(
 {
        struct inode    *inode = VFS_I(ip);
 
-       if (inode) {
+       if (!(inode->i_state & I_CLEAR)) {
                ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
                ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
        }
 }
 
 /*
- * If the linux inode exists, mark it dirty.
+ * If the linux inode is valid, mark it dirty.
  * Used when commiting a dirty inode into a transaction so that
  * the inode will get written back by the linux code
  */
@@ -81,7 +81,7 @@ xfs_mark_inode_dirty_sync(
 {
        struct inode    *inode = VFS_I(ip);
 
-       if (inode)
+       if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
                mark_inode_dirty_sync(inode);
 }
 
@@ -128,7 +128,7 @@ xfs_ichgtime(
        if (sync_it) {
                SYNCHRONIZE();
                ip->i_update_core = 1;
-               mark_inode_dirty_sync(inode);
+               xfs_mark_inode_dirty_sync(ip);
        }
 }
 
@@ -601,7 +601,7 @@ xfs_vn_setattr(
        struct dentry   *dentry,
        struct iattr    *iattr)
 {
-       return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL);
+       return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 }
 
 /*
@@ -642,7 +642,7 @@ xfs_vn_fallocate(
 
        xfs_ilock(ip, XFS_IOLOCK_EXCL);
        error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
-                                     0, NULL, XFS_ATTR_NOLOCK);
+                                     0, XFS_ATTR_NOLOCK);
        if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
            offset + len > i_size_read(inode))
                new_size = offset + len;
@@ -653,7 +653,7 @@ xfs_vn_fallocate(
 
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = new_size;
-               error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL);
+               error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
        }
 
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -766,12 +766,21 @@ xfs_diflags_to_iflags(
  * When reading existing inodes from disk this is called directly
  * from xfs_iget, when creating a new inode it is called from
  * xfs_ialloc after setting up the inode.
+ *
+ * We are always called with an uninitialised linux inode here.
+ * We need to initialise the necessary fields and take a reference
+ * on it.
  */
 void
 xfs_setup_inode(
        struct xfs_inode        *ip)
 {
-       struct inode            *inode = ip->i_vnode;
+       struct inode            *inode = &ip->i_vnode;
+
+       inode->i_ino = ip->i_ino;
+       inode->i_state = I_NEW|I_LOCK;
+       inode_add_to_lists(ip->i_mount->m_super, inode);
+       ASSERT(atomic_read(&inode->i_count) == 1);
 
        inode->i_mode   = ip->i_d.di_mode;
        inode->i_nlink  = ip->i_d.di_nlink;
index cc0f7b3a97957687114dd961d9543cd5c807aeb3..77d6ddcaf54735b991c7ea10bedffdfdea4bee95 100644 (file)
@@ -77,6 +77,7 @@
 #include <linux/spinlock.h>
 #include <linux/random.h>
 #include <linux/ctype.h>
+#include <linux/writeback.h>
 
 #include <asm/page.h>
 #include <asm/div64.h>
 #undef  HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
 #endif
 
-#define restricted_chown       xfs_params.restrict_chown.val
 #define irix_sgid_inherit      xfs_params.sgid_inherit.val
 #define irix_symlink_mode      xfs_params.symlink_mode.val
 #define xfs_panic_mask         xfs_params.panic_mask.val
index 3d5b67c075c7ab5b927884313f35196fd9249e80..64f4ec90b8b27c5babbdec3545f1abd8043c0faf 100644 (file)
@@ -53,6 +53,10 @@ xfs_read_xfsstats(
                { "icluster",           XFSSTAT_END_INODE_CLUSTER       },
                { "vnodes",             XFSSTAT_END_VNODE_OPS           },
                { "buf",                XFSSTAT_END_BUF                 },
+               { "abtb2",              XFSSTAT_END_ABTB_V2             },
+               { "abtc2",              XFSSTAT_END_ABTC_V2             },
+               { "bmbt2",              XFSSTAT_END_BMBT_V2             },
+               { "ibt2",               XFSSTAT_END_IBT_V2              },
        };
 
        /* Loop over all stats groups */
index e83820febc9feacc8b642f3555d147017048c1c8..736854b1ca1a0c6d1d92488f6cc79891241b8a6d 100644 (file)
@@ -118,6 +118,71 @@ struct xfsstats {
        __uint32_t              xb_page_retries;
        __uint32_t              xb_page_found;
        __uint32_t              xb_get_read;
+/* Version 2 btree counters */
+#define XFSSTAT_END_ABTB_V2            (XFSSTAT_END_BUF+15)
+       __uint32_t              xs_abtb_2_lookup;
+       __uint32_t              xs_abtb_2_compare;
+       __uint32_t              xs_abtb_2_insrec;
+       __uint32_t              xs_abtb_2_delrec;
+       __uint32_t              xs_abtb_2_newroot;
+       __uint32_t              xs_abtb_2_killroot;
+       __uint32_t              xs_abtb_2_increment;
+       __uint32_t              xs_abtb_2_decrement;
+       __uint32_t              xs_abtb_2_lshift;
+       __uint32_t              xs_abtb_2_rshift;
+       __uint32_t              xs_abtb_2_split;
+       __uint32_t              xs_abtb_2_join;
+       __uint32_t              xs_abtb_2_alloc;
+       __uint32_t              xs_abtb_2_free;
+       __uint32_t              xs_abtb_2_moves;
+#define XFSSTAT_END_ABTC_V2            (XFSSTAT_END_ABTB_V2+15)
+       __uint32_t              xs_abtc_2_lookup;
+       __uint32_t              xs_abtc_2_compare;
+       __uint32_t              xs_abtc_2_insrec;
+       __uint32_t              xs_abtc_2_delrec;
+       __uint32_t              xs_abtc_2_newroot;
+       __uint32_t              xs_abtc_2_killroot;
+       __uint32_t              xs_abtc_2_increment;
+       __uint32_t              xs_abtc_2_decrement;
+       __uint32_t              xs_abtc_2_lshift;
+       __uint32_t              xs_abtc_2_rshift;
+       __uint32_t              xs_abtc_2_split;
+       __uint32_t              xs_abtc_2_join;
+       __uint32_t              xs_abtc_2_alloc;
+       __uint32_t              xs_abtc_2_free;
+       __uint32_t              xs_abtc_2_moves;
+#define XFSSTAT_END_BMBT_V2            (XFSSTAT_END_ABTC_V2+15)
+       __uint32_t              xs_bmbt_2_lookup;
+       __uint32_t              xs_bmbt_2_compare;
+       __uint32_t              xs_bmbt_2_insrec;
+       __uint32_t              xs_bmbt_2_delrec;
+       __uint32_t              xs_bmbt_2_newroot;
+       __uint32_t              xs_bmbt_2_killroot;
+       __uint32_t              xs_bmbt_2_increment;
+       __uint32_t              xs_bmbt_2_decrement;
+       __uint32_t              xs_bmbt_2_lshift;
+       __uint32_t              xs_bmbt_2_rshift;
+       __uint32_t              xs_bmbt_2_split;
+       __uint32_t              xs_bmbt_2_join;
+       __uint32_t              xs_bmbt_2_alloc;
+       __uint32_t              xs_bmbt_2_free;
+       __uint32_t              xs_bmbt_2_moves;
+#define XFSSTAT_END_IBT_V2             (XFSSTAT_END_BMBT_V2+15)
+       __uint32_t              xs_ibt_2_lookup;
+       __uint32_t              xs_ibt_2_compare;
+       __uint32_t              xs_ibt_2_insrec;
+       __uint32_t              xs_ibt_2_delrec;
+       __uint32_t              xs_ibt_2_newroot;
+       __uint32_t              xs_ibt_2_killroot;
+       __uint32_t              xs_ibt_2_increment;
+       __uint32_t              xs_ibt_2_decrement;
+       __uint32_t              xs_ibt_2_lshift;
+       __uint32_t              xs_ibt_2_rshift;
+       __uint32_t              xs_ibt_2_split;
+       __uint32_t              xs_ibt_2_join;
+       __uint32_t              xs_ibt_2_alloc;
+       __uint32_t              xs_ibt_2_free;
+       __uint32_t              xs_ibt_2_moves;
 /* Extra precision counters */
        __uint64_t              xs_xstrat_bytes;
        __uint64_t              xs_write_bytes;
index 37ebe36056ebd758af0e62012b60c5dfb13b7039..c3d004bc4621c7bf0b66580d5955386294879a79 100644 (file)
@@ -18,7 +18,6 @@
 #include "xfs.h"
 #include "xfs_bit.h"
 #include "xfs_log.h"
-#include "xfs_clnt.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
@@ -36,6 +35,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
 #include "xfs_rtalloc.h"
@@ -58,6 +58,7 @@
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
+#include "xfs_sync.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
 
 static struct quotactl_ops xfs_quotactl_operations;
 static struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_vnode_zone;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
 
-STATIC struct xfs_mount_args *
-xfs_args_allocate(
-       struct super_block      *sb,
-       int                     silent)
-{
-       struct xfs_mount_args   *args;
-
-       args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
-       if (!args)
-               return NULL;
-
-       args->logbufs = args->logbufsize = -1;
-       strncpy(args->fsname, sb->s_id, MAXNAMELEN);
-
-       /* Copy the already-parsed mount(2) flags we're interested in */
-       if (sb->s_flags & MS_DIRSYNC)
-               args->flags |= XFSMNT_DIRSYNC;
-       if (sb->s_flags & MS_SYNCHRONOUS)
-               args->flags |= XFSMNT_WSYNC;
-       if (silent)
-               args->flags |= XFSMNT_QUIET;
-       args->flags |= XFSMNT_32BITINODES;
-
-       return args;
-}
-
 #define MNTOPT_LOGBUFS "logbufs"       /* number of XFS log buffers */
 #define MNTOPT_LOGBSIZE        "logbsize"      /* size of XFS log buffers */
 #define MNTOPT_LOGDEV  "logdev"        /* log device */
@@ -188,26 +162,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
        return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
 }
 
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ *
+ * Note that this function leaks the various device name allocations on
+ * failure.  The caller takes care of them.
+ */
 STATIC int
 xfs_parseargs(
        struct xfs_mount        *mp,
        char                    *options,
-       struct xfs_mount_args   *args,
-       int                     update)
+       char                    **mtpt)
 {
+       struct super_block      *sb = mp->m_super;
        char                    *this_char, *value, *eov;
-       int                     dsunit, dswidth, vol_dsunit, vol_dswidth;
-       int                     iosize;
+       int                     dsunit = 0;
+       int                     dswidth = 0;
+       int                     iosize = 0;
        int                     dmapi_implies_ikeep = 1;
+       uchar_t                 iosizelog = 0;
 
-       args->flags |= XFSMNT_BARRIER;
-       args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
+       /*
+        * Copy binary VFS mount flags we are interested in.
+        */
+       if (sb->s_flags & MS_RDONLY)
+               mp->m_flags |= XFS_MOUNT_RDONLY;
+       if (sb->s_flags & MS_DIRSYNC)
+               mp->m_flags |= XFS_MOUNT_DIRSYNC;
+       if (sb->s_flags & MS_SYNCHRONOUS)
+               mp->m_flags |= XFS_MOUNT_WSYNC;
+
+       /*
+        * Set some default flags that could be cleared by the mount option
+        * parsing.
+        */
+       mp->m_flags |= XFS_MOUNT_BARRIER;
+       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+       mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+
+       /*
+        * These can be overridden by the mount option parsing.
+        */
+       mp->m_logbufs = -1;
+       mp->m_logbsize = -1;
 
        if (!options)
                goto done;
 
-       iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
-
        while ((this_char = strsep(&options, ",")) != NULL) {
                if (!*this_char)
                        continue;
@@ -221,7 +223,7 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       args->logbufs = simple_strtoul(value, &eov, 10);
+                       mp->m_logbufs = simple_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -229,7 +231,7 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       args->logbufsize = suffix_strtoul(value, &eov, 10);
+                       mp->m_logbsize = suffix_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -237,7 +239,9 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       strncpy(args->logname, value, MAXNAMELEN);
+                       mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!mp->m_logname)
+                               return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_MTPT)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -245,7 +249,9 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       strncpy(args->mtpt, value, MAXNAMELEN);
+                       *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!*mtpt)
+                               return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -253,7 +259,9 @@ xfs_parseargs(
                                        this_char);
                                return EINVAL;
                        }
-                       strncpy(args->rtname, value, MAXNAMELEN);
+                       mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!mp->m_rtname)
+                               return ENOMEM;
                } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -262,8 +270,7 @@ xfs_parseargs(
                                return EINVAL;
                        }
                        iosize = simple_strtoul(value, &eov, 10);
-                       args->flags |= XFSMNT_IOSIZE;
-                       args->iosizelog = (uint8_t) iosize;
+                       iosizelog = ffs(iosize) - 1;
                } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -272,8 +279,7 @@ xfs_parseargs(
                                return EINVAL;
                        }
                        iosize = suffix_strtoul(value, &eov, 10);
-                       args->flags |= XFSMNT_IOSIZE;
-                       args->iosizelog = ffs(iosize) - 1;
+                       iosizelog = ffs(iosize) - 1;
                } else if (!strcmp(this_char, MNTOPT_GRPID) ||
                           !strcmp(this_char, MNTOPT_BSDGROUPS)) {
                        mp->m_flags |= XFS_MOUNT_GRPID;
@@ -281,23 +287,25 @@ xfs_parseargs(
                           !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
                        mp->m_flags &= ~XFS_MOUNT_GRPID;
                } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
-                       args->flags |= XFSMNT_WSYNC;
+                       mp->m_flags |= XFS_MOUNT_WSYNC;
                } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
-                       args->flags |= XFSMNT_OSYNCISOSYNC;
+                       mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
                } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
-                       args->flags |= XFSMNT_NORECOVERY;
+                       mp->m_flags |= XFS_MOUNT_NORECOVERY;
                } else if (!strcmp(this_char, MNTOPT_INO64)) {
-                       args->flags |= XFSMNT_INO64;
-#if !XFS_BIG_INUMS
+#if XFS_BIG_INUMS
+                       mp->m_flags |= XFS_MOUNT_INO64;
+                       mp->m_inoadd = XFS_INO64_OFFSET;
+#else
                        cmn_err(CE_WARN,
                                "XFS: %s option not allowed on this system",
                                this_char);
                        return EINVAL;
 #endif
                } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
-                       args->flags |= XFSMNT_NOALIGN;
+                       mp->m_flags |= XFS_MOUNT_NOALIGN;
                } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
-                       args->flags |= XFSMNT_SWALLOC;
+                       mp->m_flags |= XFS_MOUNT_SWALLOC;
                } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
                        if (!value || !*value) {
                                cmn_err(CE_WARN,
@@ -315,7 +323,7 @@ xfs_parseargs(
                        }
                        dswidth = simple_strtoul(value, &eov, 10);
                } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
-                       args->flags &= ~XFSMNT_32BITINODES;
+                       mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
 #if !XFS_BIG_INUMS
                        cmn_err(CE_WARN,
                                "XFS: %s option not allowed on this system",
@@ -323,56 +331,61 @@ xfs_parseargs(
                        return EINVAL;
 #endif
                } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
-                       args->flags |= XFSMNT_NOUUID;
+                       mp->m_flags |= XFS_MOUNT_NOUUID;
                } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
-                       args->flags |= XFSMNT_BARRIER;
+                       mp->m_flags |= XFS_MOUNT_BARRIER;
                } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
-                       args->flags &= ~XFSMNT_BARRIER;
+                       mp->m_flags &= ~XFS_MOUNT_BARRIER;
                } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
-                       args->flags |= XFSMNT_IKEEP;
+                       mp->m_flags |= XFS_MOUNT_IKEEP;
                } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
                        dmapi_implies_ikeep = 0;
-                       args->flags &= ~XFSMNT_IKEEP;
+                       mp->m_flags &= ~XFS_MOUNT_IKEEP;
                } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
-                       args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE;
+                       mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
                } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
-                       args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
+                       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
                } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
-                       args->flags |= XFSMNT_ATTR2;
+                       mp->m_flags |= XFS_MOUNT_ATTR2;
                } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
-                       args->flags &= ~XFSMNT_ATTR2;
-                       args->flags |= XFSMNT_NOATTR2;
+                       mp->m_flags &= ~XFS_MOUNT_ATTR2;
+                       mp->m_flags |= XFS_MOUNT_NOATTR2;
                } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
-                       args->flags2 |= XFSMNT2_FILESTREAMS;
+                       mp->m_flags |= XFS_MOUNT_FILESTREAMS;
                } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
-                       args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA);
-                       args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA);
+                       mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+                                         XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+                                         XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+                                         XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
                           !strcmp(this_char, MNTOPT_UQUOTA) ||
                           !strcmp(this_char, MNTOPT_USRQUOTA)) {
-                       args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+                                        XFS_UQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
                           !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
-                       args->flags |= XFSMNT_UQUOTA;
-                       args->flags &= ~XFSMNT_UQUOTAENF;
+                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_UQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
                           !strcmp(this_char, MNTOPT_PRJQUOTA)) {
-                       args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF;
+                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+                                        XFS_OQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
-                       args->flags |= XFSMNT_PQUOTA;
-                       args->flags &= ~XFSMNT_PQUOTAENF;
+                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
                           !strcmp(this_char, MNTOPT_GRPQUOTA)) {
-                       args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
+                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+                                        XFS_OQUOTA_ENFD);
                } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
-                       args->flags |= XFSMNT_GQUOTA;
-                       args->flags &= ~XFSMNT_GQUOTAENF;
+                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
                } else if (!strcmp(this_char, MNTOPT_DMAPI)) {
-                       args->flags |= XFSMNT_DMAPI;
+                       mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, MNTOPT_XDSM)) {
-                       args->flags |= XFSMNT_DMAPI;
+                       mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, MNTOPT_DMI)) {
-                       args->flags |= XFSMNT_DMAPI;
+                       mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, "ihashsize")) {
                        cmn_err(CE_WARN,
        "XFS: ihashsize no longer used, option is deprecated.");
@@ -390,27 +403,29 @@ xfs_parseargs(
                }
        }
 
-       if (args->flags & XFSMNT_NORECOVERY) {
-               if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) {
-                       cmn_err(CE_WARN,
-                               "XFS: no-recovery mounts must be read-only.");
-                       return EINVAL;
-               }
+       /*
+        * no recovery flag requires a read-only mount
+        */
+       if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+           !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+               cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
+               return EINVAL;
        }
 
-       if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
+       if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
                cmn_err(CE_WARN,
        "XFS: sunit and swidth options incompatible with the noalign option");
                return EINVAL;
        }
 
-       if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) {
+       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
                cmn_err(CE_WARN,
                        "XFS: cannot mount with both project and group quota");
                return EINVAL;
        }
 
-       if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') {
+       if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
                printk("XFS: %s option needs the mount point option as well\n",
                        MNTOPT_DMAPI);
                return EINVAL;
@@ -438,27 +453,66 @@ xfs_parseargs(
         * Note that if "ikeep" or "noikeep" mount options are
         * supplied, then they are honored.
         */
-       if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep)
-               args->flags |= XFSMNT_IKEEP;
+       if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
+               mp->m_flags |= XFS_MOUNT_IKEEP;
 
-       if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+done:
+       if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+               /*
+                * At this point the superblock has not been read
+                * in, therefore we do not know the block size.
+                * Before the mount call ends we will convert
+                * these to FSBs.
+                */
                if (dsunit) {
-                       args->sunit = dsunit;
-                       args->flags |= XFSMNT_RETERR;
-               } else {
-                       args->sunit = vol_dsunit;
+                       mp->m_dalign = dsunit;
+                       mp->m_flags |= XFS_MOUNT_RETERR;
                }
-               dswidth ? (args->swidth = dswidth) :
-                         (args->swidth = vol_dswidth);
-       } else {
-               args->sunit = args->swidth = 0;
+
+               if (dswidth)
+                       mp->m_swidth = dswidth;
+       }
+
+       if (mp->m_logbufs != -1 &&
+           mp->m_logbufs != 0 &&
+           (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+            mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+               cmn_err(CE_WARN,
+                       "XFS: invalid logbufs value: %d [not %d-%d]",
+                       mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+               return XFS_ERROR(EINVAL);
+       }
+       if (mp->m_logbsize != -1 &&
+           mp->m_logbsize !=  0 &&
+           (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+            mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+            !is_power_of_2(mp->m_logbsize))) {
+               cmn_err(CE_WARN,
+       "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+                       mp->m_logbsize);
+               return XFS_ERROR(EINVAL);
+       }
+
+       mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
+       if (!mp->m_fsname)
+               return ENOMEM;
+       mp->m_fsname_len = strlen(mp->m_fsname) + 1;
+
+       if (iosizelog) {
+               if (iosizelog > XFS_MAX_IO_LOG ||
+                   iosizelog < XFS_MIN_IO_LOG) {
+                       cmn_err(CE_WARN,
+               "XFS: invalid log iosize: %d [not %d-%d]",
+                               iosizelog, XFS_MIN_IO_LOG,
+                               XFS_MAX_IO_LOG);
+                       return XFS_ERROR(EINVAL);
+               }
+
+               mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+               mp->m_readio_log = iosizelog;
+               mp->m_writeio_log = iosizelog;
        }
 
-done:
-       if (args->flags & XFSMNT_32BITINODES)
-               mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
-       if (args->flags2)
-               args->flags |= XFSMNT_FLAGS2;
        return 0;
 }
 
@@ -704,8 +758,7 @@ xfs_close_devices(
  */
 STATIC int
 xfs_open_devices(
-       struct xfs_mount        *mp,
-       struct xfs_mount_args   *args)
+       struct xfs_mount        *mp)
 {
        struct block_device     *ddev = mp->m_super->s_bdev;
        struct block_device     *logdev = NULL, *rtdev = NULL;
@@ -714,14 +767,14 @@ xfs_open_devices(
        /*
         * Open real time and log devices - order is important.
         */
-       if (args->logname[0]) {
-               error = xfs_blkdev_get(mp, args->logname, &logdev);
+       if (mp->m_logname) {
+               error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
                if (error)
                        goto out;
        }
 
-       if (args->rtname[0]) {
-               error = xfs_blkdev_get(mp, args->rtname, &rtdev);
+       if (mp->m_rtname) {
+               error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
                if (error)
                        goto out_close_logdev;
 
@@ -813,18 +866,18 @@ xfs_setup_devices(
  */
 void
 xfsaild_wakeup(
-       xfs_mount_t             *mp,
+       struct xfs_ail          *ailp,
        xfs_lsn_t               threshold_lsn)
 {
-       mp->m_ail.xa_target = threshold_lsn;
-       wake_up_process(mp->m_ail.xa_task);
+       ailp->xa_target = threshold_lsn;
+       wake_up_process(ailp->xa_task);
 }
 
 int
 xfsaild(
        void    *data)
 {
-       xfs_mount_t     *mp = (xfs_mount_t *)data;
+       struct xfs_ail  *ailp = data;
        xfs_lsn_t       last_pushed_lsn = 0;
        long            tout = 0;
 
@@ -836,11 +889,11 @@ xfsaild(
                /* swsusp */
                try_to_freeze();
 
-               ASSERT(mp->m_log);
-               if (XFS_FORCED_SHUTDOWN(mp))
+               ASSERT(ailp->xa_mount->m_log);
+               if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
                        continue;
 
-               tout = xfsaild_push(mp, &last_pushed_lsn);
+               tout = xfsaild_push(ailp, &last_pushed_lsn);
        }
 
        return 0;
@@ -848,43 +901,82 @@ xfsaild(
 
 int
 xfsaild_start(
-       xfs_mount_t     *mp)
+       struct xfs_ail  *ailp)
 {
-       mp->m_ail.xa_target = 0;
-       mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild");
-       if (IS_ERR(mp->m_ail.xa_task))
-               return -PTR_ERR(mp->m_ail.xa_task);
+       ailp->xa_target = 0;
+       ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
+       if (IS_ERR(ailp->xa_task))
+               return -PTR_ERR(ailp->xa_task);
        return 0;
 }
 
 void
 xfsaild_stop(
-       xfs_mount_t     *mp)
+       struct xfs_ail  *ailp)
 {
-       kthread_stop(mp->m_ail.xa_task);
+       kthread_stop(ailp->xa_task);
 }
 
 
-
+/* Catch misguided souls that try to use this interface on XFS */
 STATIC struct inode *
 xfs_fs_alloc_inode(
        struct super_block      *sb)
 {
-       return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
+       BUG();
+       return NULL;
 }
 
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can reclaim the inode.
+ */
 STATIC void
 xfs_fs_destroy_inode(
-       struct inode            *inode)
+       struct inode    *inode)
 {
-       kmem_zone_free(xfs_vnode_zone, inode);
+       xfs_inode_t             *ip = XFS_I(inode);
+
+       XFS_STATS_INC(vn_reclaim);
+       if (xfs_reclaim(ip))
+               panic("%s: cannot reclaim 0x%p\n", __func__, inode);
 }
 
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly intialise
+ * fields in the xfs inode that left in the initialise state
+ * when freeing the inode.
+ */
 STATIC void
 xfs_fs_inode_init_once(
-       void                    *vnode)
+       void                    *inode)
 {
-       inode_init_once((struct inode *)vnode);
+       struct xfs_inode        *ip = inode;
+
+       memset(ip, 0, sizeof(struct xfs_inode));
+
+       /* vfs inode */
+       inode_init_once(VFS_I(ip));
+
+       /* xfs inode */
+       atomic_set(&ip->i_iocount, 0);
+       atomic_set(&ip->i_pincount, 0);
+       spin_lock_init(&ip->i_flags_lock);
+       init_waitqueue_head(&ip->i_ipin_wait);
+       /*
+        * Because we want to use a counting completion, complete
+        * the flush completion once to allow a single access to
+        * the flush completion without blocking.
+        */
+       init_completion(&ip->i_flush);
+       complete(&ip->i_flush);
+
+       mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+                    "xfsino", ip->i_ino);
+       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
 }
 
 /*
@@ -912,7 +1004,7 @@ xfs_fs_write_inode(
         * it dirty again so we'll try again later.
         */
        if (error)
-               mark_inode_dirty_sync(inode);
+               xfs_mark_inode_dirty_sync(XFS_I(inode));
 
        return -error;
 }
@@ -923,164 +1015,13 @@ xfs_fs_clear_inode(
 {
        xfs_inode_t             *ip = XFS_I(inode);
 
-       /*
-        * ip can be null when xfs_iget_core calls xfs_idestroy if we
-        * find an inode with di_mode == 0 but without IGET_CREATE set.
-        */
-       if (ip) {
-               xfs_itrace_entry(ip);
-               XFS_STATS_INC(vn_rele);
-               XFS_STATS_INC(vn_remove);
-               XFS_STATS_INC(vn_reclaim);
-               XFS_STATS_DEC(vn_active);
-
-               xfs_inactive(ip);
-               xfs_iflags_clear(ip, XFS_IMODIFIED);
-               if (xfs_reclaim(ip))
-                       panic("%s: cannot reclaim 0x%p\n", __func__, inode);
-       }
+       xfs_itrace_entry(ip);
+       XFS_STATS_INC(vn_rele);
+       XFS_STATS_INC(vn_remove);
+       XFS_STATS_DEC(vn_active);
 
-       ASSERT(XFS_I(inode) == NULL);
-}
-
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
-       struct xfs_mount *mp,
-       void            *data,
-       void            (*syncer)(struct xfs_mount *, void *))
-{
-       struct bhv_vfs_sync_work *work;
-
-       work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
-       INIT_LIST_HEAD(&work->w_list);
-       work->w_syncer = syncer;
-       work->w_data = data;
-       work->w_mount = mp;
-       spin_lock(&mp->m_sync_lock);
-       list_add_tail(&work->w_list, &mp->m_sync_list);
-       spin_unlock(&mp->m_sync_lock);
-       wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inode_work(
-       struct xfs_mount *mp,
-       void            *arg)
-{
-       struct inode    *inode = arg;
-       filemap_flush(inode->i_mapping);
-       iput(inode);
-}
-
-void
-xfs_flush_inode(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       igrab(inode);
-       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
-       delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
-       struct xfs_mount *mp,
-       void            *arg)
-{
-       struct inode    *inode = arg;
-       sync_blockdev(mp->m_super->s_bdev);
-       iput(inode);
-}
-
-void
-xfs_flush_device(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       igrab(inode);
-       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
-       delay(msecs_to_jiffies(500));
-       xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
-}
-
-STATIC void
-xfs_sync_worker(
-       struct xfs_mount *mp,
-       void            *unused)
-{
-       int             error;
-
-       if (!(mp->m_flags & XFS_MOUNT_RDONLY))
-               error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
-       mp->m_sync_seq++;
-       wake_up(&mp->m_wait_single_sync_task);
-}
-
-STATIC int
-xfssyncd(
-       void                    *arg)
-{
-       struct xfs_mount        *mp = arg;
-       long                    timeleft;
-       bhv_vfs_sync_work_t     *work, *n;
-       LIST_HEAD               (tmp);
-
-       set_freezable();
-       timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
-       for (;;) {
-               timeleft = schedule_timeout_interruptible(timeleft);
-               /* swsusp */
-               try_to_freeze();
-               if (kthread_should_stop() && list_empty(&mp->m_sync_list))
-                       break;
-
-               spin_lock(&mp->m_sync_lock);
-               /*
-                * We can get woken by laptop mode, to do a sync -
-                * that's the (only!) case where the list would be
-                * empty with time remaining.
-                */
-               if (!timeleft || list_empty(&mp->m_sync_list)) {
-                       if (!timeleft)
-                               timeleft = xfs_syncd_centisecs *
-                                                       msecs_to_jiffies(10);
-                       INIT_LIST_HEAD(&mp->m_sync_work.w_list);
-                       list_add_tail(&mp->m_sync_work.w_list,
-                                       &mp->m_sync_list);
-               }
-               list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
-                       list_move(&work->w_list, &tmp);
-               spin_unlock(&mp->m_sync_lock);
-
-               list_for_each_entry_safe(work, n, &tmp, w_list) {
-                       (*work->w_syncer)(mp, work->w_data);
-                       list_del(&work->w_list);
-                       if (work == &mp->m_sync_work)
-                               continue;
-                       kmem_free(work);
-               }
-       }
-
-       return 0;
+       xfs_inactive(ip);
+       xfs_iflags_clear(ip, XFS_IMODIFIED);
 }
 
 STATIC void
@@ -1101,9 +1042,8 @@ xfs_fs_put_super(
        int                     unmount_event_flags = 0;
        int                     error;
 
-       kthread_stop(mp->m_sync_task);
-
-       xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI);
+       xfs_syncd_stop(mp);
+       xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
 
 #ifdef HAVE_DMAPI
        if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1131,16 +1071,6 @@ xfs_fs_put_super(
        error = xfs_unmount_flush(mp, 0);
        WARN_ON(error);
 
-       /*
-        * If we're forcing a shutdown, typically because of a media error,
-        * we want to make sure we invalidate dirty pages that belong to
-        * referenced vnodes as well.
-        */
-       if (XFS_FORCED_SHUTDOWN(mp)) {
-               error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
-               ASSERT(error != EFSCORRUPTED);
-       }
-
        if (mp->m_flags & XFS_MOUNT_DMAPI) {
                XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
                                unmount_event_flags);
@@ -1161,7 +1091,7 @@ xfs_fs_write_super(
        struct super_block      *sb)
 {
        if (!(sb->s_flags & MS_RDONLY))
-               xfs_sync(XFS_M(sb), SYNC_FSDATA);
+               xfs_sync_fsdata(XFS_M(sb), 0);
        sb->s_dirt = 0;
 }
 
@@ -1172,7 +1102,6 @@ xfs_fs_sync_super(
 {
        struct xfs_mount        *mp = XFS_M(sb);
        int                     error;
-       int                     flags;
 
        /*
         * Treat a sync operation like a freeze.  This is to work
@@ -1186,20 +1115,10 @@ xfs_fs_sync_super(
         * dirty the Linux inode until after the transaction I/O
         * completes.
         */
-       if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) {
-               /*
-                * First stage of freeze - no more writers will make progress
-                * now we are here, so we flush delwri and delalloc buffers
-                * here, then wait for all I/O to complete.  Data is frozen at
-                * that point. Metadata is not frozen, transactions can still
-                * occur here so don't bother flushing the buftarg (i.e
-                * SYNC_QUIESCE) because it'll just get dirty again.
-                */
-               flags = SYNC_DATA_QUIESCE;
-       } else
-               flags = SYNC_FSDATA;
-
-       error = xfs_sync(mp, flags);
+       if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
+               error = xfs_quiesce_data(mp);
+       else
+               error = xfs_sync_fsdata(mp, 0);
        sb->s_dirt = 0;
 
        if (unlikely(laptop_mode)) {
@@ -1337,9 +1256,8 @@ xfs_fs_remount(
 
        /* rw -> ro */
        if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
-               xfs_filestream_flush(mp);
-               xfs_sync(mp, SYNC_DATA_QUIESCE);
-               xfs_attr_quiesce(mp);
+               xfs_quiesce_data(mp);
+               xfs_quiesce_attr(mp);
                mp->m_flags |= XFS_MOUNT_RDONLY;
        }
 
@@ -1348,7 +1266,7 @@ xfs_fs_remount(
 
 /*
  * Second stage of a freeze. The data is already frozen so we only
- * need to take care of themetadata. Once that's done write a dummy
+ * need to take care of the metadata. Once that's done write a dummy
  * record to dirty the log in case of a crash while frozen.
  */
 STATIC void
@@ -1357,7 +1275,7 @@ xfs_fs_lockfs(
 {
        struct xfs_mount        *mp = XFS_M(sb);
 
-       xfs_attr_quiesce(mp);
+       xfs_quiesce_attr(mp);
        xfs_fs_log_dummy(mp);
 }
 
@@ -1420,177 +1338,30 @@ xfs_fs_setxquota(
                                   Q_XSETPQLIM), id, (caddr_t)fdq);
 }
 
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- */
-STATIC int
-xfs_start_flags(
-       struct xfs_mount_args   *ap,
-       struct xfs_mount        *mp)
-{
-       int                     error;
-
-       /* Values are in BBs */
-       if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
-               /*
-                * At this point the superblock has not been read
-                * in, therefore we do not know the block size.
-                * Before the mount call ends we will convert
-                * these to FSBs.
-                */
-               mp->m_dalign = ap->sunit;
-               mp->m_swidth = ap->swidth;
-       }
-
-       if (ap->logbufs != -1 &&
-           ap->logbufs != 0 &&
-           (ap->logbufs < XLOG_MIN_ICLOGS ||
-            ap->logbufs > XLOG_MAX_ICLOGS)) {
-               cmn_err(CE_WARN,
-                       "XFS: invalid logbufs value: %d [not %d-%d]",
-                       ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
-               return XFS_ERROR(EINVAL);
-       }
-       mp->m_logbufs = ap->logbufs;
-       if (ap->logbufsize != -1 &&
-           ap->logbufsize !=  0 &&
-           (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
-            ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
-            !is_power_of_2(ap->logbufsize))) {
-               cmn_err(CE_WARN,
-       "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
-                       ap->logbufsize);
-               return XFS_ERROR(EINVAL);
-       }
-
-       error = ENOMEM;
-
-       mp->m_logbsize = ap->logbufsize;
-       mp->m_fsname_len = strlen(ap->fsname) + 1;
-
-       mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
-       if (!mp->m_fsname)
-               goto out;
-
-       if (ap->rtname[0]) {
-               mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
-               if (!mp->m_rtname)
-                       goto out_free_fsname;
-
-       }
-
-       if (ap->logname[0]) {
-               mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
-               if (!mp->m_logname)
-                       goto out_free_rtname;
-       }
-
-       if (ap->flags & XFSMNT_WSYNC)
-               mp->m_flags |= XFS_MOUNT_WSYNC;
-#if XFS_BIG_INUMS
-       if (ap->flags & XFSMNT_INO64) {
-               mp->m_flags |= XFS_MOUNT_INO64;
-               mp->m_inoadd = XFS_INO64_OFFSET;
-       }
-#endif
-       if (ap->flags & XFSMNT_RETERR)
-               mp->m_flags |= XFS_MOUNT_RETERR;
-       if (ap->flags & XFSMNT_NOALIGN)
-               mp->m_flags |= XFS_MOUNT_NOALIGN;
-       if (ap->flags & XFSMNT_SWALLOC)
-               mp->m_flags |= XFS_MOUNT_SWALLOC;
-       if (ap->flags & XFSMNT_OSYNCISOSYNC)
-               mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
-       if (ap->flags & XFSMNT_32BITINODES)
-               mp->m_flags |= XFS_MOUNT_32BITINODES;
-
-       if (ap->flags & XFSMNT_IOSIZE) {
-               if (ap->iosizelog > XFS_MAX_IO_LOG ||
-                   ap->iosizelog < XFS_MIN_IO_LOG) {
-                       cmn_err(CE_WARN,
-               "XFS: invalid log iosize: %d [not %d-%d]",
-                               ap->iosizelog, XFS_MIN_IO_LOG,
-                               XFS_MAX_IO_LOG);
-                       return XFS_ERROR(EINVAL);
-               }
-
-               mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
-               mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
-       }
-
-       if (ap->flags & XFSMNT_IKEEP)
-               mp->m_flags |= XFS_MOUNT_IKEEP;
-       if (ap->flags & XFSMNT_DIRSYNC)
-               mp->m_flags |= XFS_MOUNT_DIRSYNC;
-       if (ap->flags & XFSMNT_ATTR2)
-               mp->m_flags |= XFS_MOUNT_ATTR2;
-       if (ap->flags & XFSMNT_NOATTR2)
-               mp->m_flags |= XFS_MOUNT_NOATTR2;
-
-       if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
-               mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-
-       /*
-        * no recovery flag requires a read-only mount
-        */
-       if (ap->flags & XFSMNT_NORECOVERY) {
-               if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                       cmn_err(CE_WARN,
-       "XFS: tried to mount a FS read-write without recovery!");
-                       return XFS_ERROR(EINVAL);
-               }
-               mp->m_flags |= XFS_MOUNT_NORECOVERY;
-       }
-
-       if (ap->flags & XFSMNT_NOUUID)
-               mp->m_flags |= XFS_MOUNT_NOUUID;
-       if (ap->flags & XFSMNT_BARRIER)
-               mp->m_flags |= XFS_MOUNT_BARRIER;
-       else
-               mp->m_flags &= ~XFS_MOUNT_BARRIER;
-
-       if (ap->flags2 & XFSMNT2_FILESTREAMS)
-               mp->m_flags |= XFS_MOUNT_FILESTREAMS;
-
-       if (ap->flags & XFSMNT_DMAPI)
-               mp->m_flags |= XFS_MOUNT_DMAPI;
-       return 0;
-
-
- out_free_rtname:
-       kfree(mp->m_rtname);
- out_free_fsname:
-       kfree(mp->m_fsname);
- out:
-       return error;
-}
-
 /*
  * This function fills in xfs_mount_t fields based on mount args.
  * Note: the superblock _has_ now been read in.
  */
 STATIC int
 xfs_finish_flags(
-       struct xfs_mount_args   *ap,
        struct xfs_mount        *mp)
 {
        int                     ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
 
        /* Fail a mount where the logbuf is smaller then the log stripe */
        if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-               if ((ap->logbufsize <= 0) &&
-                   (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
+               if (mp->m_logbsize <= 0 &&
+                   mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
                        mp->m_logbsize = mp->m_sb.sb_logsunit;
-               } else if (ap->logbufsize > 0 &&
-                          ap->logbufsize < mp->m_sb.sb_logsunit) {
+               } else if (mp->m_logbsize > 0 &&
+                          mp->m_logbsize < mp->m_sb.sb_logsunit) {
                        cmn_err(CE_WARN,
        "XFS: logbuf size must be greater than or equal to log stripe size");
                        return XFS_ERROR(EINVAL);
                }
        } else {
                /* Fail a mount if the logbuf is larger than 32K */
-               if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
+               if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
                        cmn_err(CE_WARN,
        "XFS: logbuf size for version 1 logs must be 16K or 32K");
                        return XFS_ERROR(EINVAL);
@@ -1602,7 +1373,7 @@ xfs_finish_flags(
         * told by noattr2 to turn it off
         */
        if (xfs_sb_version_hasattr2(&mp->m_sb) &&
-           !(ap->flags & XFSMNT_NOATTR2))
+           !(mp->m_flags & XFS_MOUNT_NOATTR2))
                mp->m_flags |= XFS_MOUNT_ATTR2;
 
        /*
@@ -1614,6 +1385,7 @@ xfs_finish_flags(
                return XFS_ERROR(EROFS);
        }
 
+#if 0 /* shared mounts were never supported on Linux */
        /*
         * check for shared mount.
         */
@@ -1636,25 +1408,11 @@ xfs_finish_flags(
                /*
                 * Shared XFS V0 can't deal with DMI.  Return EINVAL.
                 */
-               if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
+               if (mp->m_sb.sb_shared_vn == 0 &&
+                   (mp->m_flags & XFS_MOUNT_DMAPI))
                        return XFS_ERROR(EINVAL);
        }
-
-       if (ap->flags & XFSMNT_UQUOTA) {
-               mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
-               if (ap->flags & XFSMNT_UQUOTAENF)
-                       mp->m_qflags |= XFS_UQUOTA_ENFD;
-       }
-
-       if (ap->flags & XFSMNT_GQUOTA) {
-               mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
-               if (ap->flags & XFSMNT_GQUOTAENF)
-                       mp->m_qflags |= XFS_OQUOTA_ENFD;
-       } else if (ap->flags & XFSMNT_PQUOTA) {
-               mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
-               if (ap->flags & XFSMNT_PQUOTAENF)
-                       mp->m_qflags |= XFS_OQUOTA_ENFD;
-       }
+#endif
 
        return 0;
 }
@@ -1667,19 +1425,14 @@ xfs_fs_fill_super(
 {
        struct inode            *root;
        struct xfs_mount        *mp = NULL;
-       struct xfs_mount_args   *args;
        int                     flags = 0, error = ENOMEM;
-
-       args = xfs_args_allocate(sb, silent);
-       if (!args)
-               return -ENOMEM;
+       char                    *mtpt = NULL;
 
        mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
        if (!mp)
-               goto out_free_args;
+               goto out;
 
        spin_lock_init(&mp->m_sb_lock);
-       mutex_init(&mp->m_ilock);
        mutex_init(&mp->m_growlock);
        atomic_set(&mp->m_active_trans, 0);
        INIT_LIST_HEAD(&mp->m_sync_list);
@@ -1689,12 +1442,9 @@ xfs_fs_fill_super(
        mp->m_super = sb;
        sb->s_fs_info = mp;
 
-       if (sb->s_flags & MS_RDONLY)
-               mp->m_flags |= XFS_MOUNT_RDONLY;
-
-       error = xfs_parseargs(mp, (char *)data, args, 0);
+       error = xfs_parseargs(mp, (char *)data, &mtpt);
        if (error)
-               goto out_free_mp;
+               goto out_free_fsname;
 
        sb_min_blocksize(sb, BBSIZE);
        sb->s_xattr = xfs_xattr_handlers;
@@ -1702,33 +1452,28 @@ xfs_fs_fill_super(
        sb->s_qcop = &xfs_quotactl_operations;
        sb->s_op = &xfs_super_operations;
 
-       error = xfs_dmops_get(mp, args);
+       error = xfs_dmops_get(mp);
        if (error)
-               goto out_free_mp;
-       error = xfs_qmops_get(mp, args);
+               goto out_free_fsname;
+       error = xfs_qmops_get(mp);
        if (error)
                goto out_put_dmops;
 
-       if (args->flags & XFSMNT_QUIET)
+       if (silent)
                flags |= XFS_MFSI_QUIET;
 
-       error = xfs_open_devices(mp, args);
+       error = xfs_open_devices(mp);
        if (error)
                goto out_put_qmops;
 
        if (xfs_icsb_init_counters(mp))
                mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
 
-       /*
-        * Setup flags based on mount(2) options and then the superblock
-        */
-       error = xfs_start_flags(args, mp);
-       if (error)
-               goto out_free_fsname;
        error = xfs_readsb(mp, flags);
        if (error)
-               goto out_free_fsname;
-       error = xfs_finish_flags(args, mp);
+               goto out_destroy_counters;
+
+       error = xfs_finish_flags(mp);
        if (error)
                goto out_free_sb;
 
@@ -1747,7 +1492,7 @@ xfs_fs_fill_super(
        if (error)
                goto out_filestream_unmount;
 
-       XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname);
+       XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
 
        sb->s_dirt = 1;
        sb->s_magic = XFS_SB_MAGIC;
@@ -1772,35 +1517,31 @@ xfs_fs_fill_super(
                goto fail_vnrele;
        }
 
-       mp->m_sync_work.w_syncer = xfs_sync_worker;
-       mp->m_sync_work.w_mount = mp;
-       mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
-       if (IS_ERR(mp->m_sync_task)) {
-               error = -PTR_ERR(mp->m_sync_task);
+       error = xfs_syncd_init(mp);
+       if (error)
                goto fail_vnrele;
-       }
 
-       xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
+       kfree(mtpt);
 
-       kfree(args);
+       xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
        return 0;
 
  out_filestream_unmount:
        xfs_filestream_unmount(mp);
  out_free_sb:
        xfs_freesb(mp);
- out_free_fsname:
-       xfs_free_fsname(mp);
+ out_destroy_counters:
        xfs_icsb_destroy_counters(mp);
        xfs_close_devices(mp);
  out_put_qmops:
        xfs_qmops_put(mp);
  out_put_dmops:
        xfs_dmops_put(mp);
- out_free_mp:
+ out_free_fsname:
+       xfs_free_fsname(mp);
+       kfree(mtpt);
        kfree(mp);
- out_free_args:
-       kfree(args);
+ out:
        return -error;
 
  fail_vnrele:
@@ -1882,10 +1623,19 @@ xfs_alloc_trace_bufs(void)
        if (!xfs_bmap_trace_buf)
                goto out_free_alloc_trace;
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
+       xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
+                                            KM_MAYFAIL);
+       if (!xfs_allocbt_trace_buf)
+               goto out_free_bmap_trace;
+
+       xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
+       if (!xfs_inobt_trace_buf)
+               goto out_free_allocbt_trace;
+
        xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
        if (!xfs_bmbt_trace_buf)
-               goto out_free_bmap_trace;
+               goto out_free_inobt_trace;
 #endif
 #ifdef XFS_ATTR_TRACE
        xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
@@ -1907,8 +1657,12 @@ xfs_alloc_trace_bufs(void)
        ktrace_free(xfs_attr_trace_buf);
  out_free_bmbt_trace:
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        ktrace_free(xfs_bmbt_trace_buf);
+ out_free_inobt_trace:
+       ktrace_free(xfs_inobt_trace_buf);
+ out_free_allocbt_trace:
+       ktrace_free(xfs_allocbt_trace_buf);
  out_free_bmap_trace:
 #endif
 #ifdef XFS_BMAP_TRACE
@@ -1931,8 +1685,10 @@ xfs_free_trace_bufs(void)
 #ifdef XFS_ATTR_TRACE
        ktrace_free(xfs_attr_trace_buf);
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        ktrace_free(xfs_bmbt_trace_buf);
+       ktrace_free(xfs_inobt_trace_buf);
+       ktrace_free(xfs_allocbt_trace_buf);
 #endif
 #ifdef XFS_BMAP_TRACE
        ktrace_free(xfs_bmap_trace_buf);
@@ -1945,16 +1701,10 @@ xfs_free_trace_bufs(void)
 STATIC int __init
 xfs_init_zones(void)
 {
-       xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
-                                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
-                                       KM_ZONE_SPREAD,
-                                       xfs_fs_inode_init_once);
-       if (!xfs_vnode_zone)
-               goto out;
 
        xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
        if (!xfs_ioend_zone)
-               goto out_destroy_vnode_zone;
+               goto out;
 
        xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
                                                  xfs_ioend_zone);
@@ -1970,6 +1720,7 @@ xfs_init_zones(void)
                                                "xfs_bmap_free_item");
        if (!xfs_bmap_free_item_zone)
                goto out_destroy_log_ticket_zone;
+
        xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
                                                "xfs_btree_cur");
        if (!xfs_btree_cur_zone)
@@ -2017,8 +1768,8 @@ xfs_init_zones(void)
 
        xfs_inode_zone =
                kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
-                                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
-                                       KM_ZONE_SPREAD, NULL);
+                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
+                       xfs_fs_inode_init_once);
        if (!xfs_inode_zone)
                goto out_destroy_efi_zone;
 
@@ -2066,8 +1817,6 @@ xfs_init_zones(void)
        mempool_destroy(xfs_ioend_pool);
  out_destroy_ioend_zone:
        kmem_zone_destroy(xfs_ioend_zone);
- out_destroy_vnode_zone:
-       kmem_zone_destroy(xfs_vnode_zone);
  out:
        return -ENOMEM;
 }
@@ -2092,7 +1841,6 @@ xfs_destroy_zones(void)
        kmem_zone_destroy(xfs_log_ticket_zone);
        mempool_destroy(xfs_ioend_pool);
        kmem_zone_destroy(xfs_ioend_zone);
-       kmem_zone_destroy(xfs_vnode_zone);
 
 }
 
index fe2ef4e6a0f9e50fdb7a69ea8c60deb0e1892b7f..56dc48a76fab4118c8cb8b1ced4a3f823bbae312 100644 (file)
@@ -101,9 +101,6 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
-extern void xfs_flush_inode(struct xfs_inode *);
-extern void xfs_flush_device(struct xfs_inode *);
-
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 
 extern const struct export_operations xfs_export_operations;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644 (file)
index 0000000..fb5cca3
--- /dev/null
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_mru_cache.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_utils.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_rw.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/*
+ * Sync all the inodes in the given AG according to the
+ * direction given by the flags.
+ */
+STATIC int
+xfs_sync_inodes_ag(
+       xfs_mount_t     *mp,
+       int             ag,
+       int             flags)
+{
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             nr_found;
+       uint32_t        first_index = 0;
+       int             error = 0;
+       int             last_error = 0;
+       int             fflag = XFS_B_ASYNC;
+
+       if (flags & SYNC_DELWRI)
+               fflag = XFS_B_DELWRI;
+       if (flags & SYNC_WAIT)
+               fflag = 0;              /* synchronous overrides all */
+
+       do {
+               struct inode    *inode;
+               xfs_inode_t     *ip = NULL;
+               int             lock_flags = XFS_ILOCK_SHARED;
+
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                               (void**)&ip, first_index, 1);
+
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /*
+                * Update the index for the next lookup. Catch overflows
+                * into the next AG range which can occur if we have inodes
+                * in the last block of the AG and we are currently
+                * pointing to the last inode.
+                */
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /* nothing to sync during shutdown */
+               if (XFS_FORCED_SHUTDOWN(mp)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       return 0;
+               }
+
+               /*
+                * If we can't get a reference on the inode, it must be
+                * in reclaim. Leave it for the reclaim code to flush.
+                */
+               inode = VFS_I(ip);
+               if (!igrab(inode)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       continue;
+               }
+               read_unlock(&pag->pag_ici_lock);
+
+               /* bad inodes are dealt with elsewhere */
+               if (is_bad_inode(inode)) {
+                       IRELE(ip);
+                       continue;
+               }
+
+               /*
+                * If we have to flush data or wait for I/O completion
+                * we need to hold the iolock.
+                */
+               if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
+                       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+                       lock_flags |= XFS_IOLOCK_SHARED;
+                       error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
+                       if (flags & SYNC_IOWAIT)
+                               vn_iowait(ip);
+               }
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+               if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
+                       if (flags & SYNC_WAIT) {
+                               xfs_iflock(ip);
+                               if (!xfs_inode_clean(ip))
+                                       error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+                               else
+                                       xfs_ifunlock(ip);
+                       } else if (xfs_iflock_nowait(ip)) {
+                               if (!xfs_inode_clean(ip))
+                                       error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+                               else
+                                       xfs_ifunlock(ip);
+                       }
+               }
+               xfs_iput(ip, lock_flags);
+
+               if (error)
+                       last_error = error;
+               /*
+                * bail out if the filesystem is corrupted.
+                */
+               if (error == EFSCORRUPTED)
+                       return XFS_ERROR(error);
+
+       } while (nr_found);
+
+       return last_error;
+}
+
+int
+xfs_sync_inodes(
+       xfs_mount_t     *mp,
+       int             flags)
+{
+       int             error;
+       int             last_error;
+       int             i;
+       int             lflags = XFS_LOG_FORCE;
+
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return 0;
+       error = 0;
+       last_error = 0;
+
+       if (flags & SYNC_WAIT)
+               lflags |= XFS_LOG_SYNC;
+
+       for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+               if (!mp->m_perag[i].pag_ici_init)
+                       continue;
+               error = xfs_sync_inodes_ag(mp, i, flags);
+               if (error)
+                       last_error = error;
+               if (error == EFSCORRUPTED)
+                       break;
+       }
+       if (flags & SYNC_DELWRI)
+               xfs_log_force(mp, 0, lflags);
+
+       return XFS_ERROR(last_error);
+}
+
+STATIC int
+xfs_commit_dummy_trans(
+       struct xfs_mount        *mp,
+       uint                    log_flags)
+{
+       struct xfs_inode        *ip = mp->m_rootip;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       /*
+        * Put a dummy transaction in the log to tell recovery
+        * that all others are OK.
+        */
+       tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+       xfs_trans_ihold(tp, ip);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       /* XXX(hch): ignoring the error here.. */
+       error = xfs_trans_commit(tp, 0);
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       xfs_log_force(mp, 0, log_flags);
+       return 0;
+}
+
+int
+xfs_sync_fsdata(
+       struct xfs_mount        *mp,
+       int                     flags)
+{
+       struct xfs_buf          *bp;
+       struct xfs_buf_log_item *bip;
+       int                     error = 0;
+
+       /*
+        * If this is xfssyncd() then only sync the superblock if we can
+        * lock it without sleeping and it is not pinned.
+        */
+       if (flags & SYNC_BDFLUSH) {
+               ASSERT(!(flags & SYNC_WAIT));
+
+               bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+               if (!bp)
+                       goto out;
+
+               bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
+               if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
+                       goto out_brelse;
+       } else {
+               bp = xfs_getsb(mp, 0);
+
+               /*
+                * If the buffer is pinned then push on the log so we won't
+                * get stuck waiting in the write for someone, maybe
+                * ourselves, to flush the log.
+                *
+                * Even though we just pushed the log above, we did not have
+                * the superblock buffer locked at that point so it can
+                * become pinned in between there and here.
+                */
+               if (XFS_BUF_ISPINNED(bp))
+                       xfs_log_force(mp, 0, XFS_LOG_FORCE);
+       }
+
+
+       if (flags & SYNC_WAIT)
+               XFS_BUF_UNASYNC(bp);
+       else
+               XFS_BUF_ASYNC(bp);
+
+       return xfs_bwrite(mp, bp);
+
+ out_brelse:
+       xfs_buf_relse(bp);
+ out:
+       return error;
+}
+
+/*
+ * When remounting a filesystem read-only or freezing the filesystem, we have
+ * two phases to execute. This first phase is syncing the data before we
+ * quiesce the filesystem, and the second is flushing all the inodes out after
+ * we've waited for all the transactions created by the first phase to
+ * complete. The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ */
+/*
+ * First stage of freeze - no writers will make progress now we are here,
+ * so we flush delwri and delalloc buffers here, then wait for all I/O to
+ * complete.  Data is frozen at that point. Metadata is not frozen,
+ * transactions can still occur here so don't bother flushing the buftarg
+ * because it'll just get dirty again.
+ */
+int
+xfs_quiesce_data(
+       struct xfs_mount        *mp)
+{
+       int error;
+
+       /* push non-blocking */
+       xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
+       XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+       xfs_filestream_flush(mp);
+
+       /* push and block */
+       xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
+       XFS_QM_DQSYNC(mp, SYNC_WAIT);
+
+       /* write superblock and hoover up shutdown errors */
+       error = xfs_sync_fsdata(mp, 0);
+
+       /* flush data-only devices */
+       if (mp->m_rtdev_targp)
+               XFS_bflush(mp->m_rtdev_targp);
+
+       return error;
+}
+
+STATIC void
+xfs_quiesce_fs(
+       struct xfs_mount        *mp)
+{
+       int     count = 0, pincount;
+
+       xfs_flush_buftarg(mp->m_ddev_targp, 0);
+       xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+
+       /*
+        * This loop must run at least twice.  The first instance of the loop
+        * will flush most meta data but that will generate more meta data
+        * (typically directory updates).  Which then must be flushed and
+        * logged before we can write the unmount record.
+        */
+       do {
+               xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
+               pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+               if (!pincount) {
+                       delay(50);
+                       count++;
+               }
+       } while (count < 2);
+}
+
+/*
+ * Second stage of a quiesce. The data is already synced, now we have to take
+ * care of the metadata. New transactions are already blocked, so we need to
+ * wait for any remaining transactions to drain out before proceding.
+ */
+void
+xfs_quiesce_attr(
+       struct xfs_mount        *mp)
+{
+       int     error = 0;
+
+       /* wait for all modifications to complete */
+       while (atomic_read(&mp->m_active_trans) > 0)
+               delay(100);
+
+       /* flush inodes and push all remaining buffers out to disk */
+       xfs_quiesce_fs(mp);
+
+       ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
+
+       /* Push the superblock and write an unmount record */
+       error = xfs_log_sbcount(mp, 1);
+       if (error)
+               xfs_fs_cmn_err(CE_WARN, mp,
+                               "xfs_attr_quiesce: failed to log sb changes. "
+                               "Frozen image may not be consistent.");
+       xfs_log_unmount_write(mp);
+       xfs_unmountfs_writesb(mp);
+}
+
+/*
+ * Enqueue a work item to be picked up by the vfs xfssyncd thread.
+ * Doing this has two advantages:
+ * - It saves on stack space, which is tight in certain situations
+ * - It can be used (with care) as a mechanism to avoid deadlocks.
+ * Flushing while allocating in a full filesystem requires both.
+ */
+STATIC void
+xfs_syncd_queue_work(
+       struct xfs_mount *mp,
+       void            *data,
+       void            (*syncer)(struct xfs_mount *, void *))
+{
+       struct bhv_vfs_sync_work *work;
+
+       work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+       INIT_LIST_HEAD(&work->w_list);
+       work->w_syncer = syncer;
+       work->w_data = data;
+       work->w_mount = mp;
+       spin_lock(&mp->m_sync_lock);
+       list_add_tail(&work->w_list, &mp->m_sync_list);
+       spin_unlock(&mp->m_sync_lock);
+       wake_up_process(mp->m_sync_task);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations.  At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room...
+ */
+STATIC void
+xfs_flush_inode_work(
+       struct xfs_mount *mp,
+       void            *arg)
+{
+       struct inode    *inode = arg;
+       filemap_flush(inode->i_mapping);
+       iput(inode);
+}
+
+void
+xfs_flush_inode(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
+
+       igrab(inode);
+       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
+       delay(msecs_to_jiffies(500));
+}
+
+/*
+ * This is the "bigger hammer" version of xfs_flush_inode_work...
+ * (IOW, "If at first you don't succeed, use a Bigger Hammer").
+ */
+STATIC void
+xfs_flush_device_work(
+       struct xfs_mount *mp,
+       void            *arg)
+{
+       struct inode    *inode = arg;
+       sync_blockdev(mp->m_super->s_bdev);
+       iput(inode);
+}
+
+void
+xfs_flush_device(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
+
+       igrab(inode);
+       xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
+       delay(msecs_to_jiffies(500));
+       xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+}
+
+/*
+ * Every sync period we need to unpin all items, reclaim inodes, sync
+ * quota and write out the superblock. We might need to cover the log
+ * to indicate it is idle.
+ */
+STATIC void
+xfs_sync_worker(
+       struct xfs_mount *mp,
+       void            *unused)
+{
+       int             error;
+
+       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+               xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+               /* dgc: errors ignored here */
+               error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+               error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
+               if (xfs_log_need_covered(mp))
+                       error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
+       }
+       mp->m_sync_seq++;
+       wake_up(&mp->m_wait_single_sync_task);
+}
+
+STATIC int
+xfssyncd(
+       void                    *arg)
+{
+       struct xfs_mount        *mp = arg;
+       long                    timeleft;
+       bhv_vfs_sync_work_t     *work, *n;
+       LIST_HEAD               (tmp);
+
+       set_freezable();
+       timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
+       for (;;) {
+               timeleft = schedule_timeout_interruptible(timeleft);
+               /* swsusp */
+               try_to_freeze();
+               if (kthread_should_stop() && list_empty(&mp->m_sync_list))
+                       break;
+
+               spin_lock(&mp->m_sync_lock);
+               /*
+                * We can get woken by laptop mode, to do a sync -
+                * that's the (only!) case where the list would be
+                * empty with time remaining.
+                */
+               if (!timeleft || list_empty(&mp->m_sync_list)) {
+                       if (!timeleft)
+                               timeleft = xfs_syncd_centisecs *
+                                                       msecs_to_jiffies(10);
+                       INIT_LIST_HEAD(&mp->m_sync_work.w_list);
+                       list_add_tail(&mp->m_sync_work.w_list,
+                                       &mp->m_sync_list);
+               }
+               list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
+                       list_move(&work->w_list, &tmp);
+               spin_unlock(&mp->m_sync_lock);
+
+               list_for_each_entry_safe(work, n, &tmp, w_list) {
+                       (*work->w_syncer)(mp, work->w_data);
+                       list_del(&work->w_list);
+                       if (work == &mp->m_sync_work)
+                               continue;
+                       kmem_free(work);
+               }
+       }
+
+       return 0;
+}
+
+int
+xfs_syncd_init(
+       struct xfs_mount        *mp)
+{
+       mp->m_sync_work.w_syncer = xfs_sync_worker;
+       mp->m_sync_work.w_mount = mp;
+       mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
+       if (IS_ERR(mp->m_sync_task))
+               return -PTR_ERR(mp->m_sync_task);
+       return 0;
+}
+
+void
+xfs_syncd_stop(
+       struct xfs_mount        *mp)
+{
+       kthread_stop(mp->m_sync_task);
+}
+
+int
+xfs_reclaim_inode(
+       xfs_inode_t     *ip,
+       int             locked,
+       int             sync_mode)
+{
+       xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+
+       /* The hash lock here protects a thread in xfs_iget_core from
+        * racing with us on linking the inode back with a vnode.
+        * Once we have the XFS_IRECLAIM flag set it will not touch
+        * us.
+        */
+       write_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
+           !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+               spin_unlock(&ip->i_flags_lock);
+               write_unlock(&pag->pag_ici_lock);
+               if (locked) {
+                       xfs_ifunlock(ip);
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               }
+               return 1;
+       }
+       __xfs_iflags_set(ip, XFS_IRECLAIM);
+       spin_unlock(&ip->i_flags_lock);
+       write_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(ip->i_mount, pag);
+
+       /*
+        * If the inode is still dirty, then flush it out.  If the inode
+        * is not in the AIL, then it will be OK to flush it delwri as
+        * long as xfs_iflush() does not keep any references to the inode.
+        * We leave that decision up to xfs_iflush() since it has the
+        * knowledge of whether it's OK to simply do a delwri flush of
+        * the inode or whether we need to wait until the inode is
+        * pulled from the AIL.
+        * We get the flush lock regardless, though, just to make sure
+        * we don't free it while it is being flushed.
+        */
+       if (!locked) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               xfs_iflock(ip);
+       }
+
+       /*
+        * In the case of a forced shutdown we rely on xfs_iflush() to
+        * wait for the inode to be unpinned before returning an error.
+        */
+       if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+               /* synchronize with xfs_iflush_done */
+               xfs_iflock(ip);
+               xfs_ifunlock(ip);
+       }
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       xfs_ireclaim(ip);
+       return 0;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
+
+       read_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       radix_tree_tag_set(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+       spin_unlock(&ip->i_flags_lock);
+       read_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+       xfs_mount_t     *mp,
+       xfs_perag_t     *pag,
+       xfs_inode_t     *ip)
+{
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+}
+
+void
+xfs_inode_clear_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
+
+       read_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+       spin_unlock(&ip->i_flags_lock);
+       read_unlock(&pag->pag_ici_lock);
+       xfs_put_perag(mp, pag);
+}
+
+
+STATIC void
+xfs_reclaim_inodes_ag(
+       xfs_mount_t     *mp,
+       int             ag,
+       int             noblock,
+       int             mode)
+{
+       xfs_inode_t     *ip = NULL;
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             nr_found;
+       uint32_t        first_index;
+       int             skipped;
+
+restart:
+       first_index = 0;
+       skipped = 0;
+       do {
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+                                       (void**)&ip, first_index, 1,
+                                       XFS_ICI_RECLAIM_TAG);
+
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               /*
+                * Update the index for the next lookup. Catch overflows
+                * into the next AG range which can occur if we have inodes
+                * in the last block of the AG and we are currently
+                * pointing to the last inode.
+                */
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
+               }
+
+               ASSERT(xfs_iflags_test(ip, (XFS_IRECLAIMABLE|XFS_IRECLAIM)));
+
+               /* ignore if already under reclaim */
+               if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
+                       read_unlock(&pag->pag_ici_lock);
+                       continue;
+               }
+
+               if (noblock) {
+                       if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+                               read_unlock(&pag->pag_ici_lock);
+                               continue;
+                       }
+                       if (xfs_ipincount(ip) ||
+                           !xfs_iflock_nowait(ip)) {
+                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                               read_unlock(&pag->pag_ici_lock);
+                               continue;
+                       }
+               }
+               read_unlock(&pag->pag_ici_lock);
+
+               /*
+                * hmmm - this is an inode already in reclaim. Do
+                * we even bother catching it here?
+                */
+               if (xfs_reclaim_inode(ip, noblock, mode))
+                       skipped++;
+       } while (nr_found);
+
+       if (skipped) {
+               delay(1);
+               goto restart;
+       }
+       return;
+
+}
+
+int
+xfs_reclaim_inodes(
+       xfs_mount_t     *mp,
+       int              noblock,
+       int             mode)
+{
+       int             i;
+
+       for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+               if (!mp->m_perag[i].pag_ici_init)
+                       continue;
+               xfs_reclaim_inodes_ag(mp, i, noblock, mode);
+       }
+       return 0;
+}
+
+
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644 (file)
index 0000000..5f6de1e
--- /dev/null
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+
+typedef struct bhv_vfs_sync_work {
+       struct list_head        w_list;
+       struct xfs_mount        *w_mount;
+       void                    *w_data;        /* syncer routine argument */
+       void                    (*w_syncer)(struct xfs_mount *, void *);
+} bhv_vfs_sync_work_t;
+
+#define SYNC_ATTR              0x0001  /* sync attributes */
+#define SYNC_DELWRI            0x0002  /* look at delayed writes */
+#define SYNC_WAIT              0x0004  /* wait for i/o to complete */
+#define SYNC_BDFLUSH           0x0008  /* BDFLUSH is calling -- don't block */
+#define SYNC_IOWAIT            0x0010  /* wait for all I/O to complete */
+
+int xfs_syncd_init(struct xfs_mount *mp);
+void xfs_syncd_stop(struct xfs_mount *mp);
+
+int xfs_sync_inodes(struct xfs_mount *mp, int flags);
+int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
+
+int xfs_quiesce_data(struct xfs_mount *mp);
+void xfs_quiesce_attr(struct xfs_mount *mp);
+
+void xfs_flush_inode(struct xfs_inode *ip);
+void xfs_flush_device(struct xfs_inode *ip);
+
+int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
+int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+                               struct xfs_inode *ip);
+#endif
index 7dacb5bbde3f30123da154cc1fb3563046b694af..916c0ffb6083de2be56fdf56656ee199e248df73 100644 (file)
@@ -55,17 +55,6 @@ xfs_stats_clear_proc_handler(
 #endif /* CONFIG_PROC_FS */
 
 static ctl_table xfs_table[] = {
-       {
-               .ctl_name       = XFS_RESTRICT_CHOWN,
-               .procname       = "restrict_chown",
-               .data           = &xfs_params.restrict_chown.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec_minmax,
-               .strategy       = &sysctl_intvec,
-               .extra1         = &xfs_params.restrict_chown.min,
-               .extra2         = &xfs_params.restrict_chown.max
-       },
        {
                .ctl_name       = XFS_SGID_INHERIT,
                .procname       = "irix_sgid_inherit",
index 4aadb8056c373c95b0a867e40397234e505d10ae..b9937d450f8e6d72e01e186c09f446c791028ef2 100644 (file)
@@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val {
 } xfs_sysctl_val_t;
 
 typedef struct xfs_param {
-       xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
        xfs_sysctl_val_t sgid_inherit;  /* Inherit S_ISGID if process' GID is
                                         * not a member of parent dir GID. */
        xfs_sysctl_val_t symlink_mode;  /* Link creat mode affected by umask */
@@ -68,7 +67,7 @@ typedef struct xfs_param {
 enum {
        /* XFS_REFCACHE_SIZE = 1 */
        /* XFS_REFCACHE_PURGE = 2 */
-       XFS_RESTRICT_CHOWN = 3,
+       /* XFS_RESTRICT_CHOWN = 3 */
        XFS_SGID_INHERIT = 4,
        XFS_SYMLINK_MODE = 5,
        XFS_PANIC_MASK = 6,
index 7e60c7776b1cd0e93c2786671241227dce9a1798..0ab60bc2e76140644d2bdb78c32a5f5d54b569bb 100644 (file)
@@ -33,37 +33,6 @@ struct xfs_mount_args;
 
 typedef struct kstatfs bhv_statvfs_t;
 
-typedef struct bhv_vfs_sync_work {
-       struct list_head        w_list;
-       struct xfs_mount        *w_mount;
-       void                    *w_data;        /* syncer routine argument */
-       void                    (*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
-
-#define SYNC_ATTR              0x0001  /* sync attributes */
-#define SYNC_CLOSE             0x0002  /* close file system down */
-#define SYNC_DELWRI            0x0004  /* look at delayed writes */
-#define SYNC_WAIT              0x0008  /* wait for i/o to complete */
-#define SYNC_BDFLUSH           0x0010  /* BDFLUSH is calling -- don't block */
-#define SYNC_FSDATA            0x0020  /* flush fs data (e.g. superblocks) */
-#define SYNC_REFCACHE          0x0040  /* prune some of the nfs ref cache */
-#define SYNC_REMOUNT           0x0080  /* remount readonly, no dummy LRs */
-#define SYNC_IOWAIT            0x0100  /* wait for all I/O to complete */
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem,
- * we have two phases to execute. This first phase is syncing the data
- * before we quiesce the fielsystem, and the second is flushing all the
- * inodes out after we've waited for all the transactions created by
- * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
- * to ensure that the inodes are written to their location on disk
- * rather than just existing in transactions in the log. This means
- * after a quiesce there is no log replay required to write the inodes
- * to disk (this is the main difference between a sync and a quiesce).
- */
-#define SYNC_DATA_QUIESCE      (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
-#define SYNC_INODE_QUIESCE     (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
-
 #define SHUTDOWN_META_IO_ERROR 0x0001  /* write attempt to metadata failed */
 #define SHUTDOWN_LOG_IO_ERROR  0x0002  /* write attempt to the log failed */
 #define SHUTDOWN_FORCE_UMOUNT  0x0004  /* shutdown from a forced unmount */
index b52528bbbfff7953510639c5fda9d447e995adee..ad18262d651b7f3c711a9d70ecb11bb8cbb7e819 100644 (file)
@@ -84,25 +84,12 @@ vn_ioerror(
 
 #ifdef XFS_INODE_TRACE
 
-/*
- * Reference count of Linux inode if present, -1 if the xfs_inode
- * has no associated Linux inode.
- */
-static inline int xfs_icount(struct xfs_inode *ip)
-{
-       struct inode *vp = VFS_I(ip);
-
-       if (vp)
-               return vn_count(vp);
-       return -1;
-}
-
 #define KTRACE_ENTER(ip, vk, s, line, ra)                      \
        ktrace_enter(   (ip)->i_trace,                          \
 /*  0 */               (void *)(__psint_t)(vk),                \
 /*  1 */               (void *)(s),                            \
 /*  2 */               (void *)(__psint_t) line,               \
-/*  3 */               (void *)(__psint_t)xfs_icount(ip),      \
+/*  3 */               (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
 /*  4 */               (void *)(ra),                           \
 /*  5 */               NULL,                                   \
 /*  6 */               (void *)(__psint_t)current_cpu(),       \
index 683ce16210ffefe8ee69dd54c3c85e46b29331ae..bf89e41c3b8df64d2112d60db661c8900d3acad0 100644 (file)
@@ -80,11 +80,6 @@ do { \
        iput(VFS_I(ip)); \
 } while (0)
 
-static inline struct inode *vn_grab(struct inode *vp)
-{
-       return igrab(vp);
-}
-
 /*
  * Dealing with bad inodes
  */
index f2705f2fd43c264674660da0285adb5e5a2c39db..591ca6602bfb6738e4677a43c094aa62d5238254 100644 (file)
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
        if (brandnewdquot) {
                dqp->dq_flnext = dqp->dq_flprev = dqp;
                mutex_init(&dqp->q_qlock);
-               sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
+               init_waitqueue_head(&dqp->q_pinwait);
 
                /*
                 * Because we want to use a counting completion, complete
@@ -131,7 +131,7 @@ xfs_qm_dqinit(
                 dqp->q_res_bcount = 0;
                 dqp->q_res_icount = 0;
                 dqp->q_res_rtbcount = 0;
-                dqp->q_pincount = 0;
+                atomic_set(&dqp->q_pincount, 0);
                 dqp->q_hash = NULL;
                 ASSERT(dqp->dq_flnext == dqp->dq_flprev);
 
@@ -1221,16 +1221,14 @@ xfs_qm_dqflush(
        xfs_dqtrace_entry(dqp, "DQFLUSH");
 
        /*
-        * If not dirty, nada.
+        * If not dirty, or it's pinned and we are not supposed to
+        * block, nada.
         */
-       if (!XFS_DQ_IS_DIRTY(dqp)) {
+       if (!XFS_DQ_IS_DIRTY(dqp) ||
+           (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
                xfs_dqfunlock(dqp);
-               return (0);
+               return 0;
        }
-
-       /*
-        * Cant flush a pinned dquot. Wait for it.
-        */
        xfs_qm_dqunpin_wait(dqp);
 
        /*
@@ -1274,10 +1272,8 @@ xfs_qm_dqflush(
        dqp->dq_flags &= ~(XFS_DQ_DIRTY);
        mp = dqp->q_mount;
 
-       /* lsn is 64 bits */
-       spin_lock(&mp->m_ail_lock);
-       dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
-       spin_unlock(&mp->m_ail_lock);
+       xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+                                       &dqp->q_logitem.qli_item.li_lsn);
 
        /*
         * Attach an iodone routine so that we can remove this dquot from the
@@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done(
        xfs_dq_logitem_t        *qip)
 {
        xfs_dquot_t             *dqp;
+       struct xfs_ail          *ailp;
 
        dqp = qip->qli_dquot;
+       ailp = qip->qli_item.li_ailp;
 
        /*
         * We only want to pull the item from the AIL if its
@@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done(
        if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
            qip->qli_item.li_lsn == qip->qli_flush_lsn) {
 
-               spin_lock(&dqp->q_mount->m_ail_lock);
-               /*
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               spin_lock(&ailp->xa_lock);
                if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
-                       xfs_trans_delete_ail(dqp->q_mount,
-                                            (xfs_log_item_t*)qip);
+                       xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
                else
-                       spin_unlock(&dqp->q_mount->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
        }
 
        /*
@@ -1375,7 +1370,7 @@ xfs_dqunlock(
        mutex_unlock(&(dqp->q_qlock));
        if (dqp->q_logitem.qli_dquot == dqp) {
                /* Once was dqp->q_mount, but might just have been cleared */
-               xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
+               xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
                                        (xfs_log_item_t*)&(dqp->q_logitem));
        }
 }
@@ -1489,7 +1484,7 @@ xfs_qm_dqpurge(
                                "xfs_qm_dqpurge: dquot %p flush failed", dqp);
                xfs_dqflock(dqp);
        }
-       ASSERT(dqp->q_pincount == 0);
+       ASSERT(atomic_read(&dqp->q_pincount) == 0);
        ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
               !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
 
index 8958d0faf8d37606c806a1ce22d9c996036bdf08..7e455337e2ba55038199a8afeb1ac85a70786d9c 100644 (file)
@@ -83,8 +83,8 @@ typedef struct xfs_dquot {
        xfs_qcnt_t       q_res_rtbcount;/* total realtime blks used+reserved */
        mutex_t          q_qlock;       /* quota lock */
        struct completion q_flush;      /* flush completion queue */
-       uint             q_pincount;    /* pin count for this dquot */
-       sv_t             q_pinwait;     /* sync var for pinning */
+       atomic_t          q_pincount;   /* dquot pin count */
+       wait_queue_head_t q_pinwait;    /* dquot pinning wait queue */
 #ifdef XFS_DQUOT_TRACE
        struct ktrace   *q_trace;       /* trace header structure */
 #endif
index f028644caa5eb090e93620aae600bd6d9d87d26d..1728f6a7c4f50d4a37ee22900140b1a02e90c7d8 100644 (file)
@@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format(
 
 /*
  * Increment the pin count of the given dquot.
- * This value is protected by pinlock spinlock in the xQM structure.
  */
 STATIC void
 xfs_qm_dquot_logitem_pin(
        xfs_dq_logitem_t *logitem)
 {
-       xfs_dquot_t *dqp;
+       xfs_dquot_t *dqp = logitem->qli_dquot;
 
-       dqp = logitem->qli_dquot;
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-       dqp->q_pincount++;
-       spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
+       atomic_inc(&dqp->q_pincount);
 }
 
 /*
  * Decrement the pin count of the given dquot, and wake up
  * anyone in xfs_dqwait_unpin() if the count goes to 0.         The
- * dquot must have been previously pinned with a call to xfs_dqpin().
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
  */
 /* ARGSUSED */
 STATIC void
@@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin(
        xfs_dq_logitem_t *logitem,
        int               stale)
 {
-       xfs_dquot_t *dqp;
+       xfs_dquot_t *dqp = logitem->qli_dquot;
 
-       dqp = logitem->qli_dquot;
-       ASSERT(dqp->q_pincount > 0);
-       spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-       dqp->q_pincount--;
-       if (dqp->q_pincount == 0) {
-               sv_broadcast(&dqp->q_pinwait);
-       }
-       spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
+       ASSERT(atomic_read(&dqp->q_pincount) > 0);
+       if (atomic_dec_and_test(&dqp->q_pincount))
+               wake_up(&dqp->q_pinwait);
 }
 
 /* ARGSUSED */
@@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait(
        xfs_dquot_t     *dqp)
 {
        ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       if (dqp->q_pincount == 0) {
+       if (atomic_read(&dqp->q_pincount) == 0)
                return;
-       }
 
        /*
         * Give the log a push so we don't wait here too long.
         */
        xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
-       spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-       if (dqp->q_pincount == 0) {
-               spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
-               return;
-       }
-       sv_wait(&(dqp->q_pinwait), PINOD,
-               &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
+       wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
 }
 
 /*
@@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock(
        uint                    retval;
 
        dqp = qip->qli_dquot;
-       if (dqp->q_pincount > 0)
+       if (atomic_read(&dqp->q_pincount) > 0)
                return (XFS_ITEM_PINNED);
 
        if (! xfs_qm_dqlock_nowait(dqp))
@@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed(
        xfs_lsn_t lsn)
 {
        xfs_qoff_logitem_t      *qfs;
+       struct xfs_ail          *ailp;
 
        qfs = qfe->qql_start_lip;
-       spin_lock(&qfs->qql_item.li_mountp->m_ail_lock);
+       ailp = qfs->qql_item.li_ailp;
+       spin_lock(&ailp->xa_lock);
        /*
         * Delete the qoff-start logitem from the AIL.
-        * xfs_trans_delete_ail() drops the AIL lock.
+        * xfs_trans_ail_delete() drops the AIL lock.
         */
-       xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs);
+       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
        kmem_free(qfs);
        kmem_free(qfe);
        return (xfs_lsn_t)-1;
index df0ffef9775ae75d54c647a7b56c4ba248028a01..5b198d15e76bf4afe703ac83a8e036ac2d35e940 100644 (file)
@@ -20,7 +20,6 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_clnt.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
@@ -987,14 +986,10 @@ xfs_qm_dqdetach(
 }
 
 /*
- * This is called by VFS_SYNC and flags arg determines the caller,
- * and its motives, as done in xfs_sync.
- *
- * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
- * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
- * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
+ * This is called to sync quotas. We can be told to use non-blocking
+ * semantics by either the SYNC_BDFLUSH flag or the absence of the
+ * SYNC_WAIT flag.
  */
-
 int
 xfs_qm_sync(
        xfs_mount_t     *mp,
@@ -1137,7 +1132,6 @@ xfs_qm_init_quotainfo(
                return error;
        }
 
-       spin_lock_init(&qinf->qi_pinlock);
        xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
        qinf->qi_dqreclaims = 0;
 
@@ -1234,7 +1228,6 @@ xfs_qm_destroy_quotainfo(
         */
        xfs_qm_rele_quotafs_ref(mp);
 
-       spinlock_destroy(&qi->qi_pinlock);
        xfs_qm_list_destroy(&qi->qi_dqlist);
 
        if (qi->qi_uquotaip) {
index 44f25349e478bb7eb7db859da7c49c24c0cf75d1..4f2de9771728d40ac45925809f844740a503bc7a 100644 (file)
@@ -106,7 +106,6 @@ typedef struct xfs_qm {
 typedef struct xfs_quotainfo {
        xfs_inode_t     *qi_uquotaip;    /* user quota inode */
        xfs_inode_t     *qi_gquotaip;    /* group quota inode */
-       spinlock_t       qi_pinlock;     /* dquot pinning lock */
        xfs_dqlist_t     qi_dqlist;      /* all dquots in filesys */
        int              qi_dqreclaims;  /* a change here indicates
                                            a removal in the dqlist */
index eea2e60b456b6e962bd30872383e9ce07a0bfe31..9556df9f7dabbd717394ae173f6a1f467374229f 100644 (file)
@@ -20,7 +20,6 @@
 #include "xfs_bit.h"
 #include "xfs_log.h"
 #include "xfs_inum.h"
-#include "xfs_clnt.h"
 #include "xfs_trans.h"
 #include "xfs_sb.h"
 #include "xfs_ag.h"
index 1a3b803dfa55fbd2ca671e589e60098f6f5a0194..9ff28e6c5b8be9969158e1ef6378e4f52441d136 100644 (file)
@@ -127,7 +127,7 @@ xfs_qm_quotactl(
                break;
 
        case Q_XQUOTASYNC:
-               return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL));
+               return xfs_sync_inodes(mp, SYNC_DELWRI);
 
        default:
                break;
@@ -1022,101 +1022,92 @@ xfs_qm_export_flags(
 
 
 /*
- * Go thru all the inodes in the file system, releasing their dquots.
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff. This also gets called from
- * xfs_rootumount.
+ * Release all the dquots on the inodes in an AG.
  */
-void
-xfs_qm_dqrele_all_inodes(
-       struct xfs_mount *mp,
-       uint             flags)
+STATIC void
+xfs_qm_dqrele_inodes_ag(
+       xfs_mount_t     *mp,
+       int             ag,
+       uint            flags)
 {
-       xfs_inode_t     *ip, *topino;
-       uint            ireclaims;
-       struct inode    *vp;
-       boolean_t       vnode_refd;
-
-       ASSERT(mp->m_quotainfo);
+       xfs_inode_t     *ip = NULL;
+       xfs_perag_t     *pag = &mp->m_perag[ag];
+       int             first_index = 0;
+       int             nr_found;
 
-       XFS_MOUNT_ILOCK(mp);
-again:
-       ip = mp->m_inodes;
-       if (ip == NULL) {
-               XFS_MOUNT_IUNLOCK(mp);
-               return;
-       }
        do {
-               /* Skip markers inserted by xfs_sync */
-               if (ip->i_mount == NULL) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
-               /* Root inode, rbmip and rsumip have associated blocks */
-               if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
-                       ASSERT(ip->i_udquot == NULL);
-                       ASSERT(ip->i_gdquot == NULL);
-                       ip = ip->i_mnext;
-                       continue;
+               boolean_t       inode_refed;
+               struct inode    *inode;
+
+               /*
+                * use a gang lookup to find the next inode in the tree
+                * as the tree is sparse and a gang lookup walks to find
+                * the number of objects requested.
+                */
+               read_lock(&pag->pag_ici_lock);
+               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                               (void**)&ip, first_index, 1);
+
+               if (!nr_found) {
+                       read_unlock(&pag->pag_ici_lock);
+                       break;
                }
-               vp = VFS_I(ip);
-               if (!vp) {
+
+               /* update the index for the next lookup */
+               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+
+               /* skip quota inodes and those in reclaim */
+               inode = VFS_I(ip);
+               if (!inode || ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
                        ASSERT(ip->i_udquot == NULL);
                        ASSERT(ip->i_gdquot == NULL);
-                       ip = ip->i_mnext;
+                       read_unlock(&pag->pag_ici_lock);
                        continue;
                }
-               vnode_refd = B_FALSE;
                if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-                       ireclaims = mp->m_ireclaims;
-                       topino = mp->m_inodes;
-                       vp = vn_grab(vp);
-                       if (!vp)
-                               goto again;
-
-                       XFS_MOUNT_IUNLOCK(mp);
-                       /* XXX restart limit ? */
+                       inode = igrab(inode);
+                       read_unlock(&pag->pag_ici_lock);
+                       if (!inode)
+                               continue;
+                       inode_refed = B_TRUE;
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
-                       vnode_refd = B_TRUE;
                } else {
-                       ireclaims = mp->m_ireclaims;
-                       topino = mp->m_inodes;
-                       XFS_MOUNT_IUNLOCK(mp);
+                       read_unlock(&pag->pag_ici_lock);
                }
-
-               /*
-                * We don't keep the mountlock across the dqrele() call,
-                * since it can take a while..
-                */
                if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
                        xfs_qm_dqrele(ip->i_udquot);
                        ip->i_udquot = NULL;
                }
-               if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+               if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
+                   ip->i_gdquot) {
                        xfs_qm_dqrele(ip->i_gdquot);
                        ip->i_gdquot = NULL;
                }
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               /*
-                * Wait until we've dropped the ilock and mountlock to
-                * do the vn_rele. Or be condemned to an eternity in the
-                * inactive code in hell.
-                */
-               if (vnode_refd)
+               if (inode_refed)
                        IRELE(ip);
-               XFS_MOUNT_ILOCK(mp);
-               /*
-                * If an inode was inserted or removed, we gotta
-                * start over again.
-                */
-               if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
-                       /* XXX use a sentinel */
-                       goto again;
-               }
-               ip = ip->i_mnext;
-       } while (ip != mp->m_inodes);
+       } while (nr_found);
+}
+
+/*
+ * Go thru all the inodes in the file system, releasing their dquots.
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff. This also gets called from
+ * xfs_rootumount.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+       struct xfs_mount *mp,
+       uint             flags)
+{
+       int             i;
 
-       XFS_MOUNT_IUNLOCK(mp);
+       ASSERT(mp->m_quotainfo);
+       for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+               if (!mp->m_perag[i].pag_ici_init)
+                       continue;
+               xfs_qm_dqrele_inodes_ag(mp, i, flags);
+       }
 }
 
 /*------------------------------------------------------------------------*/
index c27abef7b84f9498821dfb9704e21b15bb86e781..636104254cfdc9001ca245dbd27a9fb3b271aa61 100644 (file)
@@ -84,5 +84,5 @@ assfail(char *expr, char *file, int line)
 void
 xfs_hex_dump(void *p, int length)
 {
-       print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
+       print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
 }
index 540e4c98982512a365eeccc4bd7f5d16b1d9db63..17254b529c54de506e8969826e1f905051763b80 100644 (file)
@@ -30,7 +30,7 @@
 #define XFS_ATTR_TRACE 1
 #define XFS_BLI_TRACE 1
 #define XFS_BMAP_TRACE 1
-#define XFS_BMBT_TRACE 1
+#define XFS_BTREE_TRACE 1
 #define XFS_DIR2_TRACE 1
 #define XFS_DQUOT_TRACE 1
 #define XFS_ILOCK_TRACE 1
index b2f639a1416f0ecd1c0b91e4958584de823c3a46..a8cdd73999a405080cf5748d8814306cf333170e 100644 (file)
@@ -366,7 +366,7 @@ xfs_acl_allow_set(
                return ENOTDIR;
        if (vp->i_sb->s_flags & MS_RDONLY)
                return EROFS;
-       if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
+       if (XFS_I(vp)->i_d.di_uid != current_fsuid() && !capable(CAP_FOWNER))
                return EPERM;
        return 0;
 }
@@ -413,13 +413,13 @@ xfs_acl_access(
                switch (fap->acl_entry[i].ae_tag) {
                case ACL_USER_OBJ:
                        seen_userobj = 1;
-                       if (fuid != current->fsuid)
+                       if (fuid != current_fsuid())
                                continue;
                        matched.ae_tag = ACL_USER_OBJ;
                        matched.ae_perm = allows;
                        break;
                case ACL_USER:
-                       if (fap->acl_entry[i].ae_id != current->fsuid)
+                       if (fap->acl_entry[i].ae_id != current_fsuid())
                                continue;
                        matched.ae_tag = ACL_USER;
                        matched.ae_perm = allows;
@@ -758,7 +758,7 @@ xfs_acl_setmode(
        if (gap && nomask)
                iattr.ia_mode |= gap->ae_perm << 3;
 
-       return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
+       return xfs_setattr(XFS_I(vp), &iattr, 0);
 }
 
 /*
index 61b292a9fb414f2486ce92a1175b8a11b2cc6e7d..2bfd86329141750541d37732ce10842a4b0b733b 100644 (file)
@@ -192,17 +192,23 @@ typedef struct xfs_perag
        xfs_agino_t     pagi_freecount; /* number of free inodes */
        xfs_agino_t     pagi_count;     /* number of allocated inodes */
        int             pagb_count;     /* pagb slots in use */
+       xfs_perag_busy_t *pagb_list;    /* unstable blocks */
 #ifdef __KERNEL__
        spinlock_t      pagb_lock;      /* lock for pagb_list */
-#endif
-       xfs_perag_busy_t *pagb_list;    /* unstable blocks */
+
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
        int             pag_ici_init;   /* incore inode cache initialised */
        rwlock_t        pag_ici_lock;   /* incore inode lock */
        struct radix_tree_root pag_ici_root;    /* incore inode cache root */
+#endif
 } xfs_perag_t;
 
+/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_RECLAIM_TAG    0       /* inode is to be reclaimed */
+
 #define        XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
 #define        XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
        (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
index 1956f83489f1b8378f4459844e422547d421e0ec..c47ce9075728f513f0b844c2284fd02db447418e 100644 (file)
@@ -89,6 +89,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
  * Internal functions.
  */
 
+/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_alloc_lookup_eq(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_alloc_lookup_ge(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_alloc_lookup_le(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len,    /* length of extent */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.a.ar_startblock = bno;
+       cur->bc_rec.a.ar_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int                             /* error */
+xfs_alloc_update(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* starting block of extent */
+       xfs_extlen_t            len)    /* length of extent */
+{
+       union xfs_btree_rec     rec;
+
+       rec.alloc.ar_startblock = cpu_to_be32(bno);
+       rec.alloc.ar_blockcount = cpu_to_be32(len);
+       return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+STATIC int                             /* error */
+xfs_alloc_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           *bno,   /* output: starting block of extent */
+       xfs_extlen_t            *len,   /* output: length of extent */
+       int                     *stat)  /* output: success/failure */
+{
+       union xfs_btree_rec     *rec;
+       int                     error;
+
+       error = xfs_btree_get_rec(cur, &rec, stat);
+       if (!error && *stat == 1) {
+               *bno = be32_to_cpu(rec->alloc.ar_startblock);
+               *len = be32_to_cpu(rec->alloc.ar_blockcount);
+       }
+       return error;
+}
+
 /*
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
@@ -294,21 +380,20 @@ xfs_alloc_fixup_trees(
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
+
 #ifdef DEBUG
-       {
-               xfs_alloc_block_t       *bnoblock;
-               xfs_alloc_block_t       *cntblock;
-
-               if (bno_cur->bc_nlevels == 1 &&
-                   cnt_cur->bc_nlevels == 1) {
-                       bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
-                       cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
-                       XFS_WANT_CORRUPTED_RETURN(
-                               be16_to_cpu(bnoblock->bb_numrecs) ==
-                               be16_to_cpu(cntblock->bb_numrecs));
-               }
+       if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+               struct xfs_btree_block  *bnoblock;
+               struct xfs_btree_block  *cntblock;
+
+               bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+               cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+               XFS_WANT_CORRUPTED_RETURN(
+                       bnoblock->bb_numrecs == cntblock->bb_numrecs);
        }
 #endif
+
        /*
         * Deal with all four cases: the allocated record is contained
         * within the freespace record, so we can have new freespace
@@ -333,7 +418,7 @@ xfs_alloc_fixup_trees(
        /*
         * Delete the entry from the by-size btree.
         */
-       if ((error = xfs_alloc_delete(cnt_cur, &i)))
+       if ((error = xfs_btree_delete(cnt_cur, &i)))
                return error;
        XFS_WANT_CORRUPTED_RETURN(i == 1);
        /*
@@ -343,7 +428,7 @@ xfs_alloc_fixup_trees(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_alloc_insert(cnt_cur, &i)))
+               if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
@@ -351,7 +436,7 @@ xfs_alloc_fixup_trees(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_alloc_insert(cnt_cur, &i)))
+               if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
@@ -362,7 +447,7 @@ xfs_alloc_fixup_trees(
                /*
                 * No remaining freespace, just delete the by-block tree entry.
                 */
-               if ((error = xfs_alloc_delete(bno_cur, &i)))
+               if ((error = xfs_btree_delete(bno_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        } else {
@@ -379,7 +464,7 @@ xfs_alloc_fixup_trees(
                if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 0);
-               if ((error = xfs_alloc_insert(bno_cur, &i)))
+               if ((error = xfs_btree_insert(bno_cur, &i)))
                        return error;
                XFS_WANT_CORRUPTED_RETURN(i == 1);
        }
@@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact(
        /*
         * Allocate/initialize a cursor for the by-number freespace btree.
         */
-       bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO, NULL, 0);
+       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
        /*
         * Lookup bno and minlen in the btree (minlen is irrelevant, really).
         * Look for the closest free block <= bno, it must contain bno
@@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact(
         * We are allocating agbno for rlen [agbno .. end]
         * Allocate/initialize a cursor for the by-size btree.
         */
-       cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT, NULL, 0);
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
        ASSERT(args->agbno + args->len <=
                be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
@@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near(
        /*
         * Get a cursor for the by-size btree.
         */
-       cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT, NULL, 0);
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
        ltlen = 0;
        bno_cur_lt = bno_cur_gt = NULL;
        /*
@@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near(
                                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                                if (ltlen >= args->minlen)
                                        break;
-                               if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
                                        goto error0;
                        } while (i);
                        ASSERT(ltlen >= args->minlen);
@@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near(
                i = cnt_cur->bc_ptrs[0];
                for (j = 1, blen = 0, bdiff = 0;
                     !error && j && (blen < args->maxlen || bdiff > 0);
-                    error = xfs_alloc_increment(cnt_cur, 0, &j)) {
+                    error = xfs_btree_increment(cnt_cur, 0, &j)) {
                        /*
                         * For each entry, decide if it's better than
                         * the previous best entry.
@@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near(
                /*
                 * Set up a cursor for the by-bno tree.
                 */
-               bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
-                       args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0);
+               bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+                       args->agbp, args->agno, XFS_BTNUM_BNO);
                /*
                 * Fix up the btree entries.
                 */
@@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near(
        /*
         * Allocate and initialize the cursor for the leftward search.
         */
-       bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO, NULL, 0);
+       bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
        /*
         * Lookup <= bno to find the leftward search's starting point.
         */
@@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near(
         * Increment the cursor, so we will point at the entry just right
         * of the leftward entry if any, or to the leftmost entry.
         */
-       if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
                goto error0;
        if (!i) {
                /*
@@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near(
                                        args->minlen, &ltbnoa, &ltlena);
                        if (ltlena >= args->minlen)
                                break;
-                       if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
+                       if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
                                goto error0;
                        if (!i) {
                                xfs_btree_del_cursor(bno_cur_lt,
@@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near(
                                        args->minlen, &gtbnoa, &gtlena);
                        if (gtlena >= args->minlen)
                                break;
-                       if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+                       if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
                                goto error0;
                        if (!i) {
                                xfs_btree_del_cursor(bno_cur_gt,
@@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near(
                                        /*
                                         * Fell off the right end.
                                         */
-                                       if ((error = xfs_alloc_increment(
+                                       if ((error = xfs_btree_increment(
                                                        bno_cur_gt, 0, &i)))
                                                goto error0;
                                        if (!i) {
@@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near(
                                        /*
                                         * Fell off the left end.
                                         */
-                                       if ((error = xfs_alloc_decrement(
+                                       if ((error = xfs_btree_decrement(
                                                        bno_cur_lt, 0, &i)))
                                                goto error0;
                                        if (!i) {
@@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size(
        /*
         * Allocate and initialize a cursor for the by-size btree.
         */
-       cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_CNT, NULL, 0);
+       cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_CNT);
        bno_cur = NULL;
        /*
         * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size(
                bestflen = flen;
                bestfbno = fbno;
                for (;;) {
-                       if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
                                goto error0;
                        if (i == 0)
                                break;
@@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size(
        /*
         * Allocate and initialize a cursor for the by-block tree.
         */
-       bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
-               args->agno, XFS_BTNUM_BNO, NULL, 0);
+       bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+               args->agno, XFS_BTNUM_BNO);
        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
                        rbno, rlen, XFSA_FIXUP_CNT_OK)))
                goto error0;
@@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small(
        xfs_extlen_t    flen;
        int             i;
 
-       if ((error = xfs_alloc_decrement(ccur, 0, &i)))
+       if ((error = xfs_btree_decrement(ccur, 0, &i)))
                goto error0;
        if (i) {
                if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1515,8 +1600,7 @@ xfs_free_ag_extent(
        /*
         * Allocate and initialize a cursor for the by-block btree.
         */
-       bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL,
-               0);
+       bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
        cnt_cur = NULL;
        /*
         * Look for a neighboring block on the left (lower block numbers)
@@ -1549,7 +1633,7 @@ xfs_free_ag_extent(
         * Look for a neighboring block on the right (higher block numbers)
         * that is contiguous with this space.
         */
-       if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
+       if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
                goto error0;
        if (haveright) {
                /*
@@ -1575,8 +1659,7 @@ xfs_free_ag_extent(
        /*
         * Now allocate and initialize a cursor for the by-size tree.
         */
-       cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL,
-               0);
+       cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
        /*
         * Have both left and right contiguous neighbors.
         * Merge all three into a single free block.
@@ -1588,7 +1671,7 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
@@ -1597,19 +1680,19 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
                 * Delete the old by-block entry for the right block.
                 */
-               if ((error = xfs_alloc_delete(bno_cur, &i)))
+               if ((error = xfs_btree_delete(bno_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
                 * Move the by-block cursor back to the left neighbor.
                 */
-               if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 #ifdef DEBUG
@@ -1648,14 +1731,14 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
                 * Back up the by-block cursor to the left neighbor, and
                 * update its length.
                 */
-               if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+               if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                nbno = ltbno;
@@ -1674,7 +1757,7 @@ xfs_free_ag_extent(
                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_delete(cnt_cur, &i)))
+               if ((error = xfs_btree_delete(cnt_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                /*
@@ -1693,7 +1776,7 @@ xfs_free_ag_extent(
        else {
                nbno = bno;
                nlen = len;
-               if ((error = xfs_alloc_insert(bno_cur, &i)))
+               if ((error = xfs_btree_insert(bno_cur, &i)))
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        }
@@ -1705,7 +1788,7 @@ xfs_free_ag_extent(
        if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
-       if ((error = xfs_alloc_insert(cnt_cur, &i)))
+       if ((error = xfs_btree_insert(cnt_cur, &i)))
                goto error0;
        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -2188,6 +2271,9 @@ xfs_alloc_read_agf(
                be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
                be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
                be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+       if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+               agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+                                               be32_to_cpu(agf->agf_length);
        if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
                        XFS_RANDOM_ALLOC_READ_AGF))) {
                XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
@@ -2213,6 +2299,7 @@ xfs_alloc_read_agf(
 #ifdef DEBUG
        else if (!XFS_FORCED_SHUTDOWN(mp)) {
                ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+               ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
                ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
                ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
                ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
index 5aec15d0651e836191d794edecddebc797cdf32c..588172796f7b92c61f696591bb5241f1d8f42ccc 100644 (file)
@@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf;
 #define        XFS_ALLOC_KTRACE_BUSYSEARCH     6
 #endif
 
+void
+xfs_alloc_mark_busy(xfs_trans_t *tp,
+               xfs_agnumber_t agno,
+               xfs_agblock_t bno,
+               xfs_extlen_t len);
+
+void
+xfs_alloc_clear_busy(xfs_trans_t *tp,
+               xfs_agnumber_t ag,
+               int idx);
+
+#endif /* __KERNEL__ */
+
 /*
  * Compute and fill in value of m_ag_maxlevels.
  */
@@ -196,18 +209,4 @@ xfs_free_extent(
        xfs_fsblock_t   bno,    /* starting block number of extent */
        xfs_extlen_t    len);   /* length of extent */
 
-void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
-               xfs_agnumber_t agno,
-               xfs_agblock_t bno,
-               xfs_extlen_t len);
-
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
-               xfs_agnumber_t ag,
-               int idx);
-
-
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_ALLOC_H__ */
index 3ce2645508ae70ea826cc3b77cb2aeeaa1830785..733cb75a8c5d956b96743f4b67c6a654850a366a 100644 (file)
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 
-/*
- * Prototypes for internal functions.
- */
-
-STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
-               xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
-
-/*
- * Internal functions.
- */
-
-/*
- * Single level of the xfs_alloc_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int                             /* error */
-xfs_alloc_delrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level removing record from */
-       int                     *stat)  /* fail/done/go-on */
-{
-       xfs_agf_t               *agf;   /* allocation group freelist header */
-       xfs_alloc_block_t       *block; /* btree block record/key lives in */
-       xfs_agblock_t           bno;    /* btree block number */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_alloc_key_t         key;    /* kp points here if block is level 0 */
-       xfs_agblock_t           lbno;   /* left block's block number */
-       xfs_buf_t               *lbp;   /* left block's buffer pointer */
-       xfs_alloc_block_t       *left;  /* left btree block */
-       xfs_alloc_key_t         *lkp=NULL;      /* left block key pointer */
-       xfs_alloc_ptr_t         *lpp=NULL;      /* left block address pointer */
-       int                     lrecs=0;        /* number of records in left block */
-       xfs_alloc_rec_t         *lrp;   /* left block record pointer */
-       xfs_mount_t             *mp;    /* mount structure */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_agblock_t           rbno;   /* right block's block number */
-       xfs_buf_t               *rbp;   /* right block's buffer pointer */
-       xfs_alloc_block_t       *right; /* right btree block */
-       xfs_alloc_key_t         *rkp;   /* right block key pointer */
-       xfs_alloc_ptr_t         *rpp;   /* right block address pointer */
-       int                     rrecs=0;        /* number of records in right block */
-       int                     numrecs;
-       xfs_alloc_rec_t         *rrp;   /* right block record pointer */
-       xfs_btree_cur_t         *tcur;  /* temporary btree cursor */
-
-       /*
-        * Get the index of the entry being deleted, check for nothing there.
-        */
-       ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Get the buffer & block containing the record or key/ptr.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-#endif
-       /*
-        * Fail if we're off the end of the block.
-        */
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (ptr > numrecs) {
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_abt_delrec);
-       /*
-        * It's a nonleaf.  Excise the key and ptr being deleted, by
-        * sliding the entries past them down one.
-        * Log the changed areas of the block.
-        */
-       if (level > 0) {
-               lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-               lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = ptr; i < numrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-                               return error;
-               }
-#endif
-               if (ptr < numrecs) {
-                       memmove(&lkp[ptr - 1], &lkp[ptr],
-                               (numrecs - ptr) * sizeof(*lkp));
-                       memmove(&lpp[ptr - 1], &lpp[ptr],
-                               (numrecs - ptr) * sizeof(*lpp));
-                       xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
-                       xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
-               }
-       }
-       /*
-        * It's a leaf.  Excise the record being deleted, by sliding the
-        * entries past it down one.  Log the changed areas of the block.
-        */
-       else {
-               lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-               if (ptr < numrecs) {
-                       memmove(&lrp[ptr - 1], &lrp[ptr],
-                               (numrecs - ptr) * sizeof(*lrp));
-                       xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
-               }
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               if (ptr == 1) {
-                       key.ar_startblock = lrp->ar_startblock;
-                       key.ar_blockcount = lrp->ar_blockcount;
-                       lkp = &key;
-               }
-       }
-       /*
-        * Decrement and log the number of entries in the block.
-        */
-       numrecs--;
-       block->bb_numrecs = cpu_to_be16(numrecs);
-       xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-       /*
-        * See if the longest free extent in the allocation group was
-        * changed by this operation.  True if it's the by-size btree, and
-        * this is the leaf level, and there is no right sibling block,
-        * and this was the last record.
-        */
-       agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-       mp = cur->bc_mp;
-
-       if (level == 0 &&
-           cur->bc_btnum == XFS_BTNUM_CNT &&
-           be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-           ptr > numrecs) {
-               ASSERT(ptr == numrecs + 1);
-               /*
-                * There are still records in the block.  Grab the size
-                * from the last one.
-                */
-               if (numrecs) {
-                       rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
-                       agf->agf_longest = rrp->ar_blockcount;
-               }
-               /*
-                * No free extents left.
-                */
-               else
-                       agf->agf_longest = 0;
-               mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
-                       be32_to_cpu(agf->agf_longest);
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_LONGEST);
-       }
-       /*
-        * Is this the root level?  If so, we're almost done.
-        */
-       if (level == cur->bc_nlevels - 1) {
-               /*
-                * If this is the root level,
-                * and there's only one entry left,
-                * and it's NOT the leaf level,
-                * then we can get rid of this level.
-                */
-               if (numrecs == 1 && level > 0) {
-                       /*
-                        * lpp is still set to the first pointer in the block.
-                        * Make it the new root of the btree.
-                        */
-                       bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
-                       agf->agf_roots[cur->bc_btnum] = *lpp;
-                       be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
-                       mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
-                       /*
-                        * Put this buffer/block on the ag's freelist.
-                        */
-                       error = xfs_alloc_put_freelist(cur->bc_tp,
-                                       cur->bc_private.a.agbp, NULL, bno, 1);
-                       if (error)
-                               return error;
-                       /*
-                        * Since blocks move to the free list without the
-                        * coordination used in xfs_bmap_finish, we can't allow
-                        * block to be available for reallocation and
-                        * non-transaction writing (user data) until we know
-                        * that the transaction that moved it to the free list
-                        * is permanently on disk. We track the blocks by
-                        * declaring these blocks as "busy"; the busy list is
-                        * maintained on a per-ag basis and each transaction
-                        * records which entries should be removed when the
-                        * iclog commits to disk. If a busy block is
-                        * allocated, the iclog is pushed up to the LSN
-                        * that freed the block.
-                        */
-                       xfs_alloc_mark_busy(cur->bc_tp,
-                               be32_to_cpu(agf->agf_seqno), bno, 1);
-
-                       xfs_trans_agbtree_delta(cur->bc_tp, -1);
-                       xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                               XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-                       /*
-                        * Update the cursor so there's one fewer level.
-                        */
-                       xfs_btree_setbuf(cur, level, NULL);
-                       cur->bc_nlevels--;
-               } else if (level > 0 &&
-                          (error = xfs_alloc_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we deleted the leftmost entry in the block, update the
-        * key values above us in the tree.
-        */
-       if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
-               return error;
-       /*
-        * If the number of records remaining in the block is at least
-        * the minimum, we're done.
-        */
-       if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-               if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Otherwise, we have to move some records around to keep the
-        * tree balanced.  Look at the left and right sibling blocks to
-        * see if we can re-balance by moving only one record.
-        */
-       rbno = be32_to_cpu(block->bb_rightsib);
-       lbno = be32_to_cpu(block->bb_leftsib);
-       bno = NULLAGBLOCK;
-       ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
-       /*
-        * Duplicate the cursor so our btree manipulations here won't
-        * disrupt the next level up.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       /*
-        * If there's a right sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (rbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the last entry in the next block.
-                * Actually any entry but the first would suffice.
-                */
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_increment(tcur, level, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Grab a pointer to the block.
-                */
-               rbp = tcur->bc_bufs[level];
-               right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(right->bb_leftsib);
-               /*
-                * If right block is full enough so that removing one entry
-                * won't make it too empty, and left-shifting an entry out
-                * of right to us works, we're done.
-                */
-               if (be16_to_cpu(right->bb_numrecs) - 1 >=
-                    XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_alloc_lshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_ALLOC_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level > 0 &&
-                                   (error = xfs_alloc_decrement(cur, level,
-                                           &i)))
-                                       return error;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference, and fix up the temp cursor to point
-                * to our block again (last record).
-                */
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if (lbno != NULLAGBLOCK) {
-                       i = xfs_btree_firstrec(tcur, level);
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       if ((error = xfs_alloc_decrement(tcur, level, &i)))
-                               goto error0;
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               }
-       }
-       /*
-        * If there's a left sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (lbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the first entry in the
-                * previous block.
-                */
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_alloc_decrement(tcur, level, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               xfs_btree_firstrec(tcur, level);
-               /*
-                * Grab a pointer to the block.
-                */
-               lbp = tcur->bc_bufs[level];
-               left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(left->bb_rightsib);
-               /*
-                * If left block is full enough so that removing one entry
-                * won't make it too empty, and right-shifting an entry out
-                * of left to us works, we're done.
-                */
-               if (be16_to_cpu(left->bb_numrecs) - 1 >=
-                    XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_alloc_rshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_ALLOC_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level == 0)
-                                       cur->bc_ptrs[0]++;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference.
-                */
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       }
-       /*
-        * Delete the temp cursor, we're done with it.
-        */
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       /*
-        * If here, we need to do a join to keep the tree balanced.
-        */
-       ASSERT(bno != NULLAGBLOCK);
-       /*
-        * See if we can join with the left neighbor block.
-        */
-       if (lbno != NULLAGBLOCK &&
-           lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "right" to be the starting block,
-                * "left" to be the left neighbor.
-                */
-               rbno = bno;
-               right = block;
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               rbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, lbno, 0, &lbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       return error;
-       }
-       /*
-        * If that won't work, see if we can join with the right neighbor block.
-        */
-       else if (rbno != NULLAGBLOCK &&
-                rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "left" to be the starting block,
-                * "right" to be the right neighbor.
-                */
-               lbno = bno;
-               left = block;
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               lbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, rbno, 0, &rbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       return error;
-       }
-       /*
-        * Otherwise, we can't fix the imbalance.
-        * Just return.  This is probably a logic error, but it's not fatal.
-        */
-       else {
-               if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * We're now going to join "left" and "right" by moving all the stuff
-        * in "right" to "left" and deleting "right".
-        */
-       if (level > 0) {
-               /*
-                * It's a non-leaf.  Move keys and pointers.
-                */
-               lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
-               lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < rrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(lkp, rkp, rrecs * sizeof(*lkp));
-               memcpy(lpp, rpp, rrecs * sizeof(*lpp));
-               xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
-               xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       } else {
-               /*
-                * It's a leaf.  Move records.
-                */
-               lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               memcpy(lrp, rrp, rrecs * sizeof(*lrp));
-               xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       }
-       /*
-        * If we joined with the left neighbor, set the buffer in the
-        * cursor to the left block, and fix up the index.
-        */
-       if (bp != lbp) {
-               xfs_btree_setbuf(cur, level, lbp);
-               cur->bc_ptrs[level] += lrecs;
-       }
-       /*
-        * If we joined with the right neighbor and there's a level above
-        * us, increment the cursor at that level.
-        */
-       else if (level + 1 < cur->bc_nlevels &&
-                (error = xfs_alloc_increment(cur, level + 1, &i)))
-               return error;
-       /*
-        * Fix up the number of records in the surviving block.
-        */
-       lrecs += rrecs;
-       left->bb_numrecs = cpu_to_be16(lrecs);
-       /*
-        * Fix up the right block pointer in the surviving block, and log it.
-        */
-       left->bb_rightsib = right->bb_rightsib;
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there is a right sibling now, make it point to the
-        * remaining block.
-        */
-       if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-               xfs_alloc_block_t       *rrblock;
-               xfs_buf_t               *rrbp;
-
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
-                               &rrbp, XFS_ALLOC_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(lbno);
-               xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * Free the deleting block by putting it on the freelist.
-        */
-       error = xfs_alloc_put_freelist(cur->bc_tp,
-                                        cur->bc_private.a.agbp, NULL, rbno, 1);
-       if (error)
-               return error;
-       /*
-        * Since blocks move to the free list without the coordination
-        * used in xfs_bmap_finish, we can't allow block to be available
-        * for reallocation and non-transaction writing (user data)
-        * until we know that the transaction that moved it to the free
-        * list is permanently on disk. We track the blocks by declaring
-        * these blocks as "busy"; the busy list is maintained on a
-        * per-ag basis and each transaction records which entries
-        * should be removed when the iclog commits to disk. If a
-        * busy block is allocated, the iclog is pushed up to the
-        * LSN that freed the block.
-        */
-       xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
-       xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
-       /*
-        * Adjust the current level's cursor so that we're left referring
-        * to the right node, after we're done.
-        * If this leaves the ptr value 0 our caller will fix it up.
-        */
-       if (level > 0)
-               cur->bc_ptrs[level]--;
-       /*
-        * Return value means the next level up has something to do.
-        */
-       *stat = 2;
-       return 0;
-
-error0:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int                             /* error */
-xfs_alloc_insrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to insert record at */
-       xfs_agblock_t           *bnop,  /* i/o: block number inserted */
-       xfs_alloc_rec_t         *recp,  /* i/o: record data inserted */
-       xfs_btree_cur_t         **curp, /* output: new cursor replacing cur */
-       int                     *stat)  /* output: success/failure */
-{
-       xfs_agf_t               *agf;   /* allocation group freelist header */
-       xfs_alloc_block_t       *block; /* btree block record/key lives in */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_alloc_key_t         key;    /* key value being inserted */
-       xfs_alloc_key_t         *kp;    /* pointer to btree keys */
-       xfs_agblock_t           nbno;   /* block number of allocated block */
-       xfs_btree_cur_t         *ncur;  /* new cursor to be used at next lvl */
-       xfs_alloc_key_t         nkey;   /* new key value, from split */
-       xfs_alloc_rec_t         nrec;   /* new record value, for caller */
-       int                     numrecs;
-       int                     optr;   /* old ptr value */
-       xfs_alloc_ptr_t         *pp;    /* pointer to btree addresses */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_alloc_rec_t         *rp;    /* pointer to btree records */
-
-       ASSERT(be32_to_cpu(recp->ar_blockcount) > 0);
-
-       /*
-        * GCC doesn't understand the (arguably complex) control flow in
-        * this function and complains about uninitialized structure fields
-        * without this.
-        */
-       memset(&nrec, 0, sizeof(nrec));
-
-       /*
-        * If we made it to the root level, allocate a new root block
-        * and we're done.
-        */
-       if (level >= cur->bc_nlevels) {
-               XFS_STATS_INC(xs_abt_insrec);
-               if ((error = xfs_alloc_newroot(cur, &i)))
-                       return error;
-               *bnop = NULLAGBLOCK;
-               *stat = i;
-               return 0;
-       }
-       /*
-        * Make a key out of the record data to be inserted, and save it.
-        */
-       key.ar_startblock = recp->ar_startblock;
-       key.ar_blockcount = recp->ar_blockcount;
-       optr = ptr = cur->bc_ptrs[level];
-       /*
-        * If we're off the left edge, return failure.
-        */
-       if (ptr == 0) {
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_abt_insrec);
-       /*
-        * Get pointers to the btree buffer and block.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-       /*
-        * Check that the new entry is being inserted in the right place.
-        */
-       if (ptr <= numrecs) {
-               if (level == 0) {
-                       rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-                       xfs_btree_check_rec(cur->bc_btnum, recp, rp);
-               } else {
-                       kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
-                       xfs_btree_check_key(cur->bc_btnum, &key, kp);
-               }
-       }
-#endif
-       nbno = NULLAGBLOCK;
-       ncur = NULL;
-       /*
-        * If the block is full, we can't insert the new entry until we
-        * make the block un-full.
-        */
-       if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * First, try shifting an entry to the right neighbor.
-                */
-               if ((error = xfs_alloc_rshift(cur, level, &i)))
-                       return error;
-               if (i) {
-                       /* nothing */
-               }
-               /*
-                * Next, try shifting an entry to the left neighbor.
-                */
-               else {
-                       if ((error = xfs_alloc_lshift(cur, level, &i)))
-                               return error;
-                       if (i)
-                               optr = ptr = cur->bc_ptrs[level];
-                       else {
-                               /*
-                                * Next, try splitting the current block in
-                                * half. If this works we have to re-set our
-                                * variables because we could be in a
-                                * different block now.
-                                */
-                               if ((error = xfs_alloc_split(cur, level, &nbno,
-                                               &nkey, &ncur, &i)))
-                                       return error;
-                               if (i) {
-                                       bp = cur->bc_bufs[level];
-                                       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-                                       if ((error =
-                                               xfs_btree_check_sblock(cur,
-                                                       block, level, bp)))
-                                               return error;
-#endif
-                                       ptr = cur->bc_ptrs[level];
-                                       nrec.ar_startblock = nkey.ar_startblock;
-                                       nrec.ar_blockcount = nkey.ar_blockcount;
-                               }
-                               /*
-                                * Otherwise the insert fails.
-                                */
-                               else {
-                                       *stat = 0;
-                                       return 0;
-                               }
-                       }
-               }
-       }
-       /*
-        * At this point we know there's room for our new entry in the block
-        * we're pointing at.
-        */
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (level > 0) {
-               /*
-                * It's a non-leaf entry.  Make a hole for the new data
-                * in the key and ptr regions of the block.
-                */
-               kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-               pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = numrecs; i >= ptr; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
-                               return error;
-               }
-#endif
-               memmove(&kp[ptr], &kp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*kp));
-               memmove(&pp[ptr], &pp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
-                       return error;
-#endif
-               /*
-                * Now stuff the new data in, bump numrecs and log the new data.
-                */
-               kp[ptr - 1] = key;
-               pp[ptr - 1] = cpu_to_be32(*bnop);
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_alloc_log_keys(cur, bp, ptr, numrecs);
-               xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-               if (ptr < numrecs)
-                       xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
-                               kp + ptr);
-#endif
-       } else {
-               /*
-                * It's a leaf entry.  Make a hole for the new record.
-                */
-               rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-               memmove(&rp[ptr], &rp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*rp));
-               /*
-                * Now stuff the new record in, bump numrecs
-                * and log the new data.
-                */
-               rp[ptr - 1] = *recp;
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_alloc_log_recs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
-               if (ptr < numrecs)
-                       xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
-                               rp + ptr);
-#endif
-       }
-       /*
-        * Log the new number of records in the btree header.
-        */
-       xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-       /*
-        * If we inserted at the start of a block, update the parents' keys.
-        */
-       if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
-               return error;
-       /*
-        * Look to see if the longest extent in the allocation group
-        * needs to be updated.
-        */
 
-       agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-       if (level == 0 &&
-           cur->bc_btnum == XFS_BTNUM_CNT &&
-           be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-           be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
-               /*
-                * If this is a leaf in the by-size btree and there
-                * is no right sibling block and this block is bigger
-                * than the previous longest block, update it.
-                */
-               agf->agf_longest = recp->ar_blockcount;
-               cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
-                       = be32_to_cpu(recp->ar_blockcount);
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_LONGEST);
-       }
-       /*
-        * Return the new block number, if any.
-        * If there is one, give back a record value and a cursor too.
-        */
-       *bnop = nbno;
-       if (nbno != NULLAGBLOCK) {
-               *recp = nrec;
-               *curp = ncur;
-       }
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_alloc_log_block(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     fields) /* mask of fields: XFS_BB_... */
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+       struct xfs_btree_cur    *cur)
 {
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       static const short      offsets[] = {   /* table of offsets */
-               offsetof(xfs_alloc_block_t, bb_magic),
-               offsetof(xfs_alloc_block_t, bb_level),
-               offsetof(xfs_alloc_block_t, bb_numrecs),
-               offsetof(xfs_alloc_block_t, bb_leftsib),
-               offsetof(xfs_alloc_block_t, bb_rightsib),
-               sizeof(xfs_alloc_block_t)
-       };
-
-       xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
-       xfs_trans_log_buf(tp, bp, first, last);
+       return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.a.agbp, cur->bc_private.a.agno,
+                       cur->bc_btnum);
 }
-
-/*
- * Log keys from a btree block (nonleaf).
- */
-STATIC void
-xfs_alloc_log_keys(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     kfirst, /* index of first key to log */
-       int                     klast)  /* index of last key to log */
-{
-       xfs_alloc_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       xfs_alloc_key_t         *kp;    /* key pointer in btree block */
-       int                     last;   /* last byte offset logged */
-
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-}
-
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
-STATIC void
-xfs_alloc_log_ptrs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     pfirst, /* index of first pointer to log */
-       int                     plast)  /* index of last pointer to log */
-{
-       xfs_alloc_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_alloc_ptr_t         *pp;    /* block-pointer pointer in btree blk */
-
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-}
-
-/*
- * Log records from a btree block (leaf).
- */
-STATIC void
-xfs_alloc_log_recs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     rfirst, /* index of first record to log */
-       int                     rlast)  /* index of last record to log */
-{
-       xfs_alloc_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_alloc_rec_t         *rp;    /* record pointer for btree block */
-
-
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-       rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-#ifdef DEBUG
-       {
-               xfs_agf_t       *agf;
-               xfs_alloc_rec_t *p;
-
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
-                       ASSERT(be32_to_cpu(p->ar_startblock) +
-                              be32_to_cpu(p->ar_blockcount) <=
-                              be32_to_cpu(agf->agf_length));
-       }
-#endif
-       first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
-}
-
-/*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
- */
-STATIC int                             /* error */
-xfs_alloc_lookup(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_lookup_t            dir,    /* <=, ==, or >= */
-       int                     *stat)  /* success/failure */
-{
-       xfs_agblock_t           agbno;  /* a.g. relative btree block number */
-       xfs_agnumber_t          agno;   /* allocation group number */
-       xfs_alloc_block_t       *block=NULL;    /* current btree block */
-       int                     diff;   /* difference for the current key */
-       int                     error;  /* error return value */
-       int                     keyno=0;        /* current key number */
-       int                     level;  /* level in the btree */
-       xfs_mount_t             *mp;    /* file system mount point */
-
-       XFS_STATS_INC(xs_abt_lookup);
-       /*
-        * Get the allocation group header, and the root block number.
-        */
-       mp = cur->bc_mp;
-
-       {
-               xfs_agf_t       *agf;   /* a.g. freespace header */
-
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               agno = be32_to_cpu(agf->agf_seqno);
-               agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
-       }
-       /*
-        * Iterate over each level in the btree, starting at the root.
-        * For each level above the leaves, find the key we need, based
-        * on the lookup record, then follow the corresponding block
-        * pointer down to the next level.
-        */
-       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-               xfs_buf_t       *bp;    /* buffer pointer for btree block */
-               xfs_daddr_t     d;      /* disk address of btree block */
-
-               /*
-                * Get the disk address we're looking for.
-                */
-               d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-               /*
-                * If the old buffer at this level is for a different block,
-                * throw it away, otherwise just use it.
-                */
-               bp = cur->bc_bufs[level];
-               if (bp && XFS_BUF_ADDR(bp) != d)
-                       bp = NULL;
-               if (!bp) {
-                       /*
-                        * Need to get a new buffer.  Read it, then
-                        * set it in the cursor, releasing the old one.
-                        */
-                       if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
-                                       agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
-                               return error;
-                       xfs_btree_setbuf(cur, level, bp);
-                       /*
-                        * Point to the btree block, now that we have the buffer
-                        */
-                       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-                       if ((error = xfs_btree_check_sblock(cur, block, level,
-                                       bp)))
-                               return error;
-               } else
-                       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-               /*
-                * If we already had a key match at a higher level, we know
-                * we need to use the first entry in this block.
-                */
-               if (diff == 0)
-                       keyno = 1;
-               /*
-                * Otherwise we need to search this block.  Do a binary search.
-                */
-               else {
-                       int             high;   /* high entry number */
-                       xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
-                       xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
-                       int             low;    /* low entry number */
-
-                       /*
-                        * Get a pointer to keys or records.
-                        */
-                       if (level > 0)
-                               kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
-                       else
-                               krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
-                       /*
-                        * Set low and high entry numbers, 1-based.
-                        */
-                       low = 1;
-                       if (!(high = be16_to_cpu(block->bb_numrecs))) {
-                               /*
-                                * If the block is empty, the tree must
-                                * be an empty leaf.
-                                */
-                               ASSERT(level == 0 && cur->bc_nlevels == 1);
-                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-                               *stat = 0;
-                               return 0;
-                       }
-                       /*
-                        * Binary search the block.
-                        */
-                       while (low <= high) {
-                               xfs_extlen_t    blockcount;     /* key value */
-                               xfs_agblock_t   startblock;     /* key value */
-
-                               XFS_STATS_INC(xs_abt_compare);
-                               /*
-                                * keyno is average of low and high.
-                                */
-                               keyno = (low + high) >> 1;
-                               /*
-                                * Get startblock & blockcount.
-                                */
-                               if (level > 0) {
-                                       xfs_alloc_key_t *kkp;
-
-                                       kkp = kkbase + keyno - 1;
-                                       startblock = be32_to_cpu(kkp->ar_startblock);
-                                       blockcount = be32_to_cpu(kkp->ar_blockcount);
-                               } else {
-                                       xfs_alloc_rec_t *krp;
-
-                                       krp = krbase + keyno - 1;
-                                       startblock = be32_to_cpu(krp->ar_startblock);
-                                       blockcount = be32_to_cpu(krp->ar_blockcount);
-                               }
-                               /*
-                                * Compute difference to get next direction.
-                                */
-                               if (cur->bc_btnum == XFS_BTNUM_BNO)
-                                       diff = (int)startblock -
-                                              (int)cur->bc_rec.a.ar_startblock;
-                               else if (!(diff = (int)blockcount -
-                                           (int)cur->bc_rec.a.ar_blockcount))
-                                       diff = (int)startblock -
-                                           (int)cur->bc_rec.a.ar_startblock;
-                               /*
-                                * Less than, move right.
-                                */
-                               if (diff < 0)
-                                       low = keyno + 1;
-                               /*
-                                * Greater than, move left.
-                                */
-                               else if (diff > 0)
-                                       high = keyno - 1;
-                               /*
-                                * Equal, we're done.
-                                */
-                               else
-                                       break;
-                       }
-               }
-               /*
-                * If there are more levels, set up for the next level
-                * by getting the block number and filling in the cursor.
-                */
-               if (level > 0) {
-                       /*
-                        * If we moved left, need the previous key number,
-                        * unless there isn't one.
-                        */
-                       if (diff > 0 && --keyno < 1)
-                               keyno = 1;
-                       agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
-                       if ((error = xfs_btree_check_sptr(cur, agbno, level)))
-                               return error;
-#endif
-                       cur->bc_ptrs[level] = keyno;
-               }
-       }
-       /*
-        * Done with the search.
-        * See if we need to adjust the results.
-        */
-       if (dir != XFS_LOOKUP_LE && diff < 0) {
-               keyno++;
-               /*
-                * If ge search and we went off the end of the block, but it's
-                * not the last block, we're in the wrong block.
-                */
-               if (dir == XFS_LOOKUP_GE &&
-                   keyno > be16_to_cpu(block->bb_numrecs) &&
-                   be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
-                       int     i;
-
-                       cur->bc_ptrs[0] = keyno;
-                       if ((error = xfs_alloc_increment(cur, 0, &i)))
-                               return error;
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
-                       *stat = 1;
-                       return 0;
-               }
-       }
-       else if (dir == XFS_LOOKUP_LE && diff > 0)
-               keyno--;
-       cur->bc_ptrs[0] = keyno;
-       /*
-        * Return if we succeeded or not.
-        */
-       if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
-               *stat = 0;
-       else
-               *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
-       return 0;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_alloc_lshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-#ifdef DEBUG
-       int                     i;      /* loop index */
-#endif
-       xfs_alloc_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left neighbor block */
-       xfs_alloc_block_t       *left;  /* left neighbor btree block */
-       int                     nrec;   /* new number of left block entries */
-       xfs_buf_t               *rbp;   /* buffer for right (current) block */
-       xfs_alloc_block_t       *right; /* right (current) btree block */
-       xfs_alloc_key_t         *rkp=NULL;      /* key pointer for right block */
-       xfs_alloc_ptr_t         *rpp=NULL;      /* address pointer for right block */
-       xfs_alloc_rec_t         *rrp=NULL;      /* record pointer for right block */
-
-       /*
-        * Set up variables for this block as "right".
-        */
-       rbp = cur->bc_bufs[level];
-       right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-               return error;
-#endif
-       /*
-        * If we've got no left sibling then we can't shift an entry left.
-        */
-       if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] <= 1) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the left neighbor as "left".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
-                       0, &lbp, XFS_ALLOC_BTREE_REF)))
-               return error;
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
-               *stat = 0;
-               return 0;
-       }
-       nrec = be16_to_cpu(left->bb_numrecs) + 1;
-       /*
-        * If non-leaf, copy a key and a ptr to the left block.
-        */
-       if (level > 0) {
-               xfs_alloc_key_t *lkp;   /* key pointer for left block */
-               xfs_alloc_ptr_t *lpp;   /* address pointer for left block */
-
-               lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               *lkp = *rkp;
-               xfs_alloc_log_keys(cur, lbp, nrec, nrec);
-               lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
-                       return error;
-#endif
-               *lpp = *rpp;
-               xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
-               xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
-       }
-       /*
-        * If leaf, copy a record to the left block.
-        */
-       else {
-               xfs_alloc_rec_t *lrp;   /* record pointer for left block */
-
-               lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               *lrp = *rrp;
-               xfs_alloc_log_recs(cur, lbp, nrec, nrec);
-               xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
-       }
-       /*
-        * Bump and log left's numrecs, decrement and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, 1);
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, -1);
-       xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Slide the contents of right down one entry.
-        */
-       if (level > 0) {
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
-                                       level)))
-                               return error;
-               }
-#endif
-               memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-       } else {
-               memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               key.ar_startblock = rrp->ar_startblock;
-               key.ar_blockcount = rrp->ar_blockcount;
-               rkp = &key;
-       }
-       /*
-        * Update the parent key values of right.
-        */
-       if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
-               return error;
-       /*
-        * Slide the cursor value left one.
-        */
-       cur->bc_ptrs[level]--;
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int                             /* error */
-xfs_alloc_newroot(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-       xfs_agblock_t           lbno;   /* left block number */
-       xfs_buf_t               *lbp;   /* left btree buffer */
-       xfs_alloc_block_t       *left;  /* left btree block */
-       xfs_mount_t             *mp;    /* mount structure */
-       xfs_agblock_t           nbno;   /* new block number */
-       xfs_buf_t               *nbp;   /* new (root) buffer */
-       xfs_alloc_block_t       *new;   /* new (root) btree block */
-       int                     nptr;   /* new value for key index, 1 or 2 */
-       xfs_agblock_t           rbno;   /* right block number */
-       xfs_buf_t               *rbp;   /* right btree buffer */
-       xfs_alloc_block_t       *right; /* right btree block */
-
-       mp = cur->bc_mp;
-
-       ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
-       /*
-        * Get a buffer from the freelist blocks, for the new root.
-        */
-       error = xfs_alloc_get_freelist(cur->bc_tp,
-                                       cur->bc_private.a.agbp, &nbno, 1);
-       if (error)
-               return error;
-       /*
-        * None available, we fail.
-        */
-       if (nbno == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
-       nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
-               0);
-       new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
-       /*
-        * Set the root data in the a.g. freespace structure.
-        */
-       {
-               xfs_agf_t       *agf;   /* a.g. freespace header */
-               xfs_agnumber_t  seqno;
-
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
-               be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
-               seqno = be32_to_cpu(agf->agf_seqno);
-               mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-       }
-       /*
-        * At the previous root level there are now two blocks: the old
-        * root, and the new block generated when it was split.
-        * We don't know which one the cursor is pointing at, so we
-        * set up variables "left" and "right" for each case.
-        */
-       lbp = cur->bc_bufs[cur->bc_nlevels - 1];
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
-               return error;
-#endif
-       if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-               /*
-                * Our block is left, pick up the right block.
-                */
-               lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
-               rbno = be32_to_cpu(left->bb_rightsib);
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, rbno, 0, &rbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-               if ((error = xfs_btree_check_sblock(cur, right,
-                               cur->bc_nlevels - 1, rbp)))
-                       return error;
-               nptr = 1;
-       } else {
-               /*
-                * Our block is right, pick up the left block.
-                */
-               rbp = lbp;
-               right = left;
-               rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
-               lbno = be32_to_cpu(right->bb_leftsib);
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, lbno, 0, &lbp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-               if ((error = xfs_btree_check_sblock(cur, left,
-                               cur->bc_nlevels - 1, lbp)))
-                       return error;
-               nptr = 2;
-       }
-       /*
-        * Fill in the new block's btree header and log it.
-        */
-       new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       new->bb_level = cpu_to_be16(cur->bc_nlevels);
-       new->bb_numrecs = cpu_to_be16(2);
-       new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-       new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-       xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
-       ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
-       /*
-        * Fill in the key data in the new root.
-        */
-       {
-               xfs_alloc_key_t         *kp;    /* btree key pointer */
-
-               kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
-               if (be16_to_cpu(left->bb_level) > 0) {
-                       kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
-                       kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               } else {
-                       xfs_alloc_rec_t *rp;    /* btree record pointer */
-
-                       rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
-                       kp[0].ar_startblock = rp->ar_startblock;
-                       kp[0].ar_blockcount = rp->ar_blockcount;
-                       rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-                       kp[1].ar_startblock = rp->ar_startblock;
-                       kp[1].ar_blockcount = rp->ar_blockcount;
-               }
-       }
-       xfs_alloc_log_keys(cur, nbp, 1, 2);
-       /*
-        * Fill in the pointer data in the new root.
-        */
-       {
-               xfs_alloc_ptr_t         *pp;    /* btree address pointer */
-
-               pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
-               pp[0] = cpu_to_be32(lbno);
-               pp[1] = cpu_to_be32(rbno);
-       }
-       xfs_alloc_log_ptrs(cur, nbp, 1, 2);
-       /*
-        * Fix up the cursor.
-        */
-       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
-       cur->bc_ptrs[cur->bc_nlevels] = nptr;
-       cur->bc_nlevels++;
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_alloc_rshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_alloc_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left (current) block */
-       xfs_alloc_block_t       *left;  /* left (current) btree block */
-       xfs_buf_t               *rbp;   /* buffer for right neighbor block */
-       xfs_alloc_block_t       *right; /* right neighbor btree block */
-       xfs_alloc_key_t         *rkp;   /* key pointer for right block */
-       xfs_btree_cur_t         *tcur;  /* temporary cursor */
-
-       /*
-        * Set up variables for this block as "left".
-        */
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * If we've got no right sibling then we can't shift an entry right.
-        */
-       if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the right neighbor as "right".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
-                       0, &rbp, XFS_ALLOC_BTREE_REF)))
-               return error;
-       right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
+
+STATIC void
+xfs_allocbt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     inc)
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
+       int                     btnum = cur->bc_btnum;
+
+       ASSERT(ptr->s != 0);
+
+       agf->agf_roots[btnum] = ptr->s;
+       be32_add_cpu(&agf->agf_levels[btnum], inc);
+       cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+
+       xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     length,
+       int                     *stat)
+{
+       int                     error;
+       xfs_agblock_t           bno;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       /* Allocate the new block from the freelist. If we can't, give up.  */
+       error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+                                      &bno, 1);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
+       }
+
+       if (bno == NULLAGBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
                *stat = 0;
                return 0;
        }
-       /*
-        * Make a hole at the start of the right neighbor block, then
-        * copy the last left block entry to the hole.
-        */
-       if (level > 0) {
-               xfs_alloc_key_t *lkp;   /* key pointer for left block */
-               xfs_alloc_ptr_t *lpp;   /* address pointer for left block */
-               xfs_alloc_ptr_t *rpp;   /* address pointer for right block */
 
-               lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
-                       return error;
-#endif
-               *rkp = *lkp;
-               *rpp = *lpp;
-               xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
-       } else {
-               xfs_alloc_rec_t *lrp;   /* record pointer for left block */
-               xfs_alloc_rec_t *rrp;   /* record pointer for right block */
+       xfs_trans_agbtree_delta(cur->bc_tp, 1);
+       new->s = cpu_to_be32(bno);
 
-               lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               *rrp = *lrp;
-               xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               key.ar_startblock = rrp->ar_startblock;
-               key.ar_blockcount = rrp->ar_blockcount;
-               rkp = &key;
-               xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
-       }
-       /*
-        * Decrement and log left's numrecs, bump and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, -1);
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, 1);
-       xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Using a temporary cursor, update the parent key values of the
-        * block on the right.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       i = xfs_btree_lastrec(tcur, level);
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       if ((error = xfs_alloc_increment(tcur, level, &i)) ||
-           (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
-               goto error0;
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        *stat = 1;
        return 0;
-error0:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
 }
 
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int                             /* error */
-xfs_alloc_split(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to split */
-       xfs_agblock_t           *bnop,  /* output: block number allocated */
-       xfs_alloc_key_t         *keyp,  /* output: first key of new block */
-       xfs_btree_cur_t         **curp, /* output: new cursor */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_allocbt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
 {
-       int                     error;  /* error return value */
-       int                     i;      /* loop index/record number */
-       xfs_agblock_t           lbno;   /* left (current) block number */
-       xfs_buf_t               *lbp;   /* buffer for left block */
-       xfs_alloc_block_t       *left;  /* left (current) btree block */
-       xfs_agblock_t           rbno;   /* right (new) block number */
-       xfs_buf_t               *rbp;   /* buffer for right block */
-       xfs_alloc_block_t       *right; /* right (new) btree block */
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       xfs_agblock_t           bno;
+       int                     error;
 
-       /*
-        * Allocate the new block from the freelist.
-        * If we can't do it, we're toast.  Give up.
-        */
-       error = xfs_alloc_get_freelist(cur->bc_tp,
-                                        cur->bc_private.a.agbp, &rbno, 1);
+       bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
+       error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
        if (error)
                return error;
-       if (rbno == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       xfs_trans_agbtree_delta(cur->bc_tp, 1);
-       rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
-               rbno, 0);
-       /*
-        * Set up the new block as "right".
-        */
-       right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-       /*
-        * "Left" is the current (according to the cursor) block.
-        */
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * Fill in the btree header for the new block.
-        */
-       right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       right->bb_level = left->bb_level;
-       right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
-       /*
-        * Make sure that if there's an odd number of entries now, that
-        * each new block will have the same number of entries.
-        */
-       if ((be16_to_cpu(left->bb_numrecs) & 1) &&
-           cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-               be16_add_cpu(&right->bb_numrecs, 1);
-       i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
-       /*
-        * For non-leaf blocks, copy keys and addresses over to the new block.
-        */
-       if (level > 0) {
-               xfs_alloc_key_t *lkp;   /* left btree key pointer */
-               xfs_alloc_ptr_t *lpp;   /* left btree address pointer */
-               xfs_alloc_key_t *rkp;   /* right btree key pointer */
-               xfs_alloc_ptr_t *rpp;   /* right btree address pointer */
-
-               lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
-               lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
-               rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
-               rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *keyp = *rkp;
-       }
-       /*
-        * For leaf blocks, copy records over to the new block.
-        */
-       else {
-               xfs_alloc_rec_t *lrp;   /* left btree record pointer */
-               xfs_alloc_rec_t *rrp;   /* right btree record pointer */
-
-               lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
-               rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
-               memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               keyp->ar_startblock = rrp->ar_startblock;
-               keyp->ar_blockcount = rrp->ar_blockcount;
-       }
-       /*
-        * Find the left block number by looking in the buffer.
-        * Adjust numrecs, sibling pointers.
-        */
-       lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
-       be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
-       right->bb_rightsib = left->bb_rightsib;
-       left->bb_rightsib = cpu_to_be32(rbno);
-       right->bb_leftsib = cpu_to_be32(lbno);
-       xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
-       xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there's a block to the new block's right, make that block
-        * point back to right instead of to left.
-        */
-       if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
-               xfs_alloc_block_t       *rrblock;       /* rr btree block */
-               xfs_buf_t               *rrbp;          /* buffer for rrblock */
 
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0,
-                               &rrbp, XFS_ALLOC_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(rbno);
-               xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * If the cursor is really in the right block, move it there.
-        * If it's just pointing past the last entry in left, then we'll
-        * insert there, so don't change anything in that case.
-        */
-       if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
-               xfs_btree_setbuf(cur, level, rbp);
-               cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
-       }
        /*
-        * If there are more levels, we'll need another cursor which refers to
-        * the right block, no matter where this cursor was.
+        * Since blocks move to the free list without the coordination used in
+        * xfs_bmap_finish, we can't allow block to be available for
+        * reallocation and non-transaction writing (user data) until we know
+        * that the transaction that moved it to the free list is permanently
+        * on disk. We track the blocks by declaring these blocks as "busy";
+        * the busy list is maintained on a per-ag basis and each transaction
+        * records which entries should be removed when the iclog commits to
+        * disk. If a busy block is allocated, the iclog is pushed up to the
+        * LSN that freed the block.
         */
-       if (level + 1 < cur->bc_nlevels) {
-               if ((error = xfs_btree_dup_cursor(cur, curp)))
-                       return error;
-               (*curp)->bc_ptrs[level + 1]++;
-       }
-       *bnop = rbno;
-       *stat = 1;
+       xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+       xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
 }
 
 /*
- * Update keys at all levels from here to the root along the cursor's path.
+ * Update the longest extent in the AGF
  */
-STATIC int                             /* error */
-xfs_alloc_updkey(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_alloc_key_t         *keyp,  /* new key value to update to */
-       int                     level)  /* starting level for update */
+STATIC void
+xfs_allocbt_update_lastrec(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_rec     *rec,
+       int                     ptr,
+       int                     reason)
 {
-       int                     ptr;    /* index of key in block */
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+       xfs_agnumber_t          seqno = be32_to_cpu(agf->agf_seqno);
+       __be32                  len;
+       int                     numrecs;
 
-       /*
-        * Go up the tree from this level toward the root.
-        * At each level, update the key value to the value input.
-        * Stop when we reach a level where the cursor isn't pointing
-        * at the first entry in the block.
-        */
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-               xfs_alloc_block_t       *block; /* btree block */
-               xfs_buf_t               *bp;    /* buffer for block */
-#ifdef DEBUG
-               int                     error;  /* error return value */
-#endif
-               xfs_alloc_key_t         *kp;    /* ptr to btree block keys */
+       ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+       switch (reason) {
+       case LASTREC_UPDATE:
+               /*
+                * If this is the last leaf block and it's the last record,
+                * then update the size of the longest extent in the AG.
+                */
+               if (ptr != xfs_btree_get_numrecs(block))
+                       return;
+               len = rec->alloc.ar_blockcount;
+               break;
+       case LASTREC_INSREC:
+               if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+                   be32_to_cpu(agf->agf_longest))
+                       return;
+               len = rec->alloc.ar_blockcount;
+               break;
+       case LASTREC_DELREC:
+               numrecs = xfs_btree_get_numrecs(block);
+               if (ptr <= numrecs)
+                       return;
+               ASSERT(ptr == numrecs + 1);
 
-               bp = cur->bc_bufs[level];
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-                       return error;
-#endif
-               ptr = cur->bc_ptrs[level];
-               kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
-               *kp = *keyp;
-               xfs_alloc_log_keys(cur, bp, ptr, ptr);
+               if (numrecs) {
+                       xfs_alloc_rec_t *rrp;
+
+                       rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+                       len = rrp->ar_blockcount;
+               } else {
+                       len = 0;
+               }
+
+               break;
+       default:
+               ASSERT(0);
+               return;
        }
-       return 0;
+
+       agf->agf_longest = len;
+       cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+       xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
 }
 
-/*
- * Externally visible routines.
- */
+STATIC int
+xfs_allocbt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       return cur->bc_mp->m_alloc_mnr[level != 0];
+}
 
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_alloc_decrement(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_allocbt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
 {
-       xfs_alloc_block_t       *block; /* btree block */
-       int                     error;  /* error return value */
-       int                     lev;    /* btree level */
+       return cur->bc_mp->m_alloc_mxr[level != 0];
+}
 
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the left at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-       /*
-        * Decrement the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (--cur->bc_ptrs[level] > 0) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Get a pointer to the btree block.
-        */
-       block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level,
-                       cur->bc_bufs[level])))
-               return error;
-#endif
-       /*
-        * If we just went off the left edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * March up the tree decrementing pointers.
-        * Stop when we don't go off the left edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               if (--cur->bc_ptrs[lev] > 0)
-                       break;
-               /*
-                * Read-ahead the left block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-       }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
-               xfs_buf_t       *bp;    /* buffer pointer for block */
+STATIC void
+xfs_allocbt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(rec->alloc.ar_startblock != 0);
 
-               agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
-       }
-       *stat = 1;
-       return 0;
+       key->alloc.ar_startblock = rec->alloc.ar_startblock;
+       key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
 }
 
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int                                    /* error */
-xfs_alloc_delete(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
 {
-       int             error;          /* error return value */
-       int             i;              /* result code */
-       int             level;          /* btree level */
+       ASSERT(key->alloc.ar_startblock != 0);
 
-       /*
-        * Go up the tree, starting at leaf level.
-        * If 2 is returned then a join was done; go to the next level.
-        * Otherwise we are done.
-        */
-       for (level = 0, i = 2; i == 2; level++) {
-               if ((error = xfs_alloc_delrec(cur, level, &i)))
-                       return error;
-       }
-       if (i == 0) {
-               for (level = 1; level < cur->bc_nlevels; level++) {
-                       if (cur->bc_ptrs[level] == 0) {
-                               if ((error = xfs_alloc_decrement(cur, level, &i)))
-                                       return error;
-                               break;
-                       }
-               }
-       }
-       *stat = i;
-       return 0;
+       rec->alloc.ar_startblock = key->alloc.ar_startblock;
+       rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
 }
 
-/*
- * Get the data from the pointed-to record.
- */
-int                                    /* error */
-xfs_alloc_get_rec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agblock_t           *bno,   /* output: starting block of extent */
-       xfs_extlen_t            *len,   /* output: length of extent */
-       int                     *stat)  /* output: success/failure */
+STATIC void
+xfs_allocbt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
 {
-       xfs_alloc_block_t       *block; /* btree block */
-#ifdef DEBUG
-       int                     error;  /* error return value */
-#endif
-       int                     ptr;    /* record number */
+       ASSERT(cur->bc_rec.a.ar_startblock != 0);
 
-       ptr = cur->bc_ptrs[0];
-       block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
-               return error;
-#endif
-       /*
-        * Off the right end or left end, return failure.
-        */
-       if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Point to the record and extract its data.
-        */
-       {
-               xfs_alloc_rec_t         *rec;   /* record data */
+       rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+       rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
+}
+
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+       ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
+
+       ptr->s = agf->agf_roots[cur->bc_btnum];
+}
+
+STATIC __int64_t
+xfs_allocbt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       xfs_alloc_rec_incore_t  *rec = &cur->bc_rec.a;
+       xfs_alloc_key_t         *kp = &key->alloc;
+       __int64_t               diff;
 
-               rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-               *bno = be32_to_cpu(rec->ar_startblock);
-               *len = be32_to_cpu(rec->ar_blockcount);
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+                               rec->ar_startblock;
        }
-       *stat = 1;
-       return 0;
+
+       diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+       if (diff)
+               return diff;
+
+       return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_alloc_increment(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_allocbt_kill_root(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     level,
+       union xfs_btree_ptr     *newroot)
 {
-       xfs_alloc_block_t       *block; /* btree block */
-       xfs_buf_t               *bp;    /* tree block buffer */
-       int                     error;  /* error return value */
-       int                     lev;    /* btree level */
+       int                     error;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, killroot);
 
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the right at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
        /*
-        * Get a pointer to the btree block.
+        * Update the root pointer, decreasing the level by 1 and then
+        * free the old root.
         */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+       xfs_allocbt_set_root(cur, newroot, -1);
+       error = xfs_allocbt_free_block(cur, bp);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
-#endif
-       /*
-        * Increment the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we just went off the right edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
        }
-       /*
-        * March up the tree incrementing pointers.
-        * Stop when we don't go off the right edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               bp = cur->bc_bufs[lev];
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
+
+       XFS_BTREE_STATS_INC(cur, free);
+
+       xfs_btree_setbuf(cur, level, NULL);
+       cur->bc_nlevels--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
 #ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-#endif
-               if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
-                       break;
-               /*
-                * Read-ahead the right block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+STATIC int
+xfs_allocbt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return be32_to_cpu(k1->alloc.ar_startblock) <
+                      be32_to_cpu(k2->alloc.ar_startblock);
+       } else {
+               return be32_to_cpu(k1->alloc.ar_blockcount) <
+                       be32_to_cpu(k2->alloc.ar_blockcount) ||
+                       (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+                        be32_to_cpu(k1->alloc.ar_startblock) <
+                        be32_to_cpu(k2->alloc.ar_startblock));
        }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-            lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
+}
 
-               agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_ALLOC_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = 1;
+STATIC int
+xfs_allocbt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       if (cur->bc_btnum == XFS_BTNUM_BNO) {
+               return be32_to_cpu(r1->alloc.ar_startblock) +
+                       be32_to_cpu(r1->alloc.ar_blockcount) <=
+                       be32_to_cpu(r2->alloc.ar_startblock);
+       } else {
+               return be32_to_cpu(r1->alloc.ar_blockcount) <
+                       be32_to_cpu(r2->alloc.ar_blockcount) ||
+                       (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+                        be32_to_cpu(r1->alloc.ar_startblock) <
+                        be32_to_cpu(r2->alloc.ar_startblock));
        }
-       *stat = 1;
-       return 0;
 }
+#endif /* DEBUG */
 
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-int                                    /* error */
-xfs_alloc_insert(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
-{
-       int             error;          /* error return value */
-       int             i;              /* result value, 0 for failure */
-       int             level;          /* current level number in btree */
-       xfs_agblock_t   nbno;           /* new block number (split result) */
-       xfs_btree_cur_t *ncur;          /* new cursor (split result) */
-       xfs_alloc_rec_t nrec;           /* record being inserted this level */
-       xfs_btree_cur_t *pcur;          /* previous level's cursor */
+#ifdef XFS_BTREE_TRACE
+ktrace_t       *xfs_allocbt_trace_buf;
 
-       level = 0;
-       nbno = NULLAGBLOCK;
-       nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
-       nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
-       ncur = NULL;
-       pcur = cur;
-       /*
-        * Loop going up the tree, starting at the leaf level.
-        * Stop when we don't get a split block, that must mean that
-        * the insert is finished with this level.
-        */
-       do {
-               /*
-                * Insert nrec/nbno into this level of the tree.
-                * Note if we fail, nbno will be null.
-                */
-               if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
-                               &i))) {
-                       if (pcur != cur)
-                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                       return error;
-               }
-               /*
-                * See if the cursor we just used is trash.
-                * Can't trash the caller's cursor, but otherwise we should
-                * if ncur is a new cursor or we're about to be done.
-                */
-               if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
-                       cur->bc_nlevels = pcur->bc_nlevels;
-                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-               }
-               /*
-                * If we got a new cursor, switch to it.
-                */
-               if (ncur) {
-                       pcur = ncur;
-                       ncur = NULL;
-               }
-       } while (nbno != NULLAGBLOCK);
-       *stat = i;
-       return 0;
+STATIC void
+xfs_allocbt_trace_enter(
+       struct xfs_btree_cur    *cur,
+       const char              *func,
+       char                    *s,
+       int                     type,
+       int                     line,
+       __psunsigned_t          a0,
+       __psunsigned_t          a1,
+       __psunsigned_t          a2,
+       __psunsigned_t          a3,
+       __psunsigned_t          a4,
+       __psunsigned_t          a5,
+       __psunsigned_t          a6,
+       __psunsigned_t          a7,
+       __psunsigned_t          a8,
+       __psunsigned_t          a9,
+       __psunsigned_t          a10)
+{
+       ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
+               (void *)func, (void *)s, NULL, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
 }
 
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-int                                    /* error */
-xfs_alloc_lookup_eq(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   bno,            /* starting block of extent */
-       xfs_extlen_t    len,            /* length of extent */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_trace_cursor(
+       struct xfs_btree_cur    *cur,
+       __uint32_t              *s0,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
+       *s0 = cur->bc_private.a.agno;
+       *l0 = cur->bc_rec.a.ar_startblock;
+       *l1 = cur->bc_rec.a.ar_blockcount;
 }
 
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_alloc_lookup_ge(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   bno,            /* starting block of extent */
-       xfs_extlen_t    len,            /* length of extent */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_trace_key(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
+       *l0 = be32_to_cpu(key->alloc.ar_startblock);
+       *l1 = be32_to_cpu(key->alloc.ar_blockcount);
 }
 
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_alloc_lookup_le(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   bno,            /* starting block of extent */
-       xfs_extlen_t    len,            /* length of extent */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_allocbt_trace_record(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       __uint64_t              *l0,
+       __uint64_t              *l1,
+       __uint64_t              *l2)
 {
-       cur->bc_rec.a.ar_startblock = bno;
-       cur->bc_rec.a.ar_blockcount = len;
-       return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
+       *l0 = be32_to_cpu(rec->alloc.ar_startblock);
+       *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
+       *l2 = 0;
 }
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+       .rec_len                = sizeof(xfs_alloc_rec_t),
+       .key_len                = sizeof(xfs_alloc_key_t),
+
+       .dup_cursor             = xfs_allocbt_dup_cursor,
+       .set_root               = xfs_allocbt_set_root,
+       .kill_root              = xfs_allocbt_kill_root,
+       .alloc_block            = xfs_allocbt_alloc_block,
+       .free_block             = xfs_allocbt_free_block,
+       .update_lastrec         = xfs_allocbt_update_lastrec,
+       .get_minrecs            = xfs_allocbt_get_minrecs,
+       .get_maxrecs            = xfs_allocbt_get_maxrecs,
+       .init_key_from_rec      = xfs_allocbt_init_key_from_rec,
+       .init_rec_from_key      = xfs_allocbt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
+       .key_diff               = xfs_allocbt_key_diff,
+
+#ifdef DEBUG
+       .keys_inorder           = xfs_allocbt_keys_inorder,
+       .recs_inorder           = xfs_allocbt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+       .trace_enter            = xfs_allocbt_trace_enter,
+       .trace_cursor           = xfs_allocbt_trace_cursor,
+       .trace_key              = xfs_allocbt_trace_key,
+       .trace_record           = xfs_allocbt_trace_record,
+#endif
+};
 
 /*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Allocate a new allocation btree cursor.
  */
-int                                    /* error */
-xfs_alloc_update(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agblock_t           bno,    /* starting block of extent */
-       xfs_extlen_t            len)    /* length of extent */
+struct xfs_btree_cur *                 /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_buf          *agbp,          /* buffer for agf structure */
+       xfs_agnumber_t          agno,           /* allocation group number */
+       xfs_btnum_t             btnum)          /* btree identifier */
 {
-       xfs_alloc_block_t       *block; /* btree block to update */
-       int                     error;  /* error return value */
-       int                     ptr;    /* current record number (updating) */
+       struct xfs_agf          *agf = XFS_BUF_TO_AGF(agbp);
+       struct xfs_btree_cur    *cur;
 
-       ASSERT(len > 0);
-       /*
-        * Pick up the a.g. freelist struct and the current block.
-        */
-       block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
-               return error;
-#endif
-       /*
-        * Get the address of the rec to be updated.
-        */
-       ptr = cur->bc_ptrs[0];
-       {
-               xfs_alloc_rec_t         *rp;    /* pointer to updated record */
+       ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
 
-               rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
-               /*
-                * Fill in the new contents and log them.
-                */
-               rp->ar_startblock = cpu_to_be32(bno);
-               rp->ar_blockcount = cpu_to_be32(len);
-               xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
-       }
-       /*
-        * If it's the by-size btree and it's the last leaf block and
-        * it's the last record... then update the size of the longest
-        * extent in the a.g., which we cache in the a.g. freelist header.
-        */
-       if (cur->bc_btnum == XFS_BTNUM_CNT &&
-           be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
-           ptr == be16_to_cpu(block->bb_numrecs)) {
-               xfs_agf_t       *agf;   /* a.g. freespace header */
-               xfs_agnumber_t  seqno;
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
 
-               agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
-               seqno = be32_to_cpu(agf->agf_seqno);
-               cur->bc_mp->m_perag[seqno].pagf_longest = len;
-               agf->agf_longest = cpu_to_be32(len);
-               xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
-                       XFS_AGF_LONGEST);
-       }
-       /*
-        * Updating first record in leaf. Pass new key value up to our parent.
-        */
-       if (ptr == 1) {
-               xfs_alloc_key_t key;    /* key containing [bno, len] */
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
+       cur->bc_btnum = btnum;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
 
-               key.ar_startblock = cpu_to_be32(bno);
-               key.ar_blockcount = cpu_to_be32(len);
-               if ((error = xfs_alloc_updkey(cur, &key, 1)))
-                       return error;
-       }
-       return 0;
+       cur->bc_ops = &xfs_allocbt_ops;
+       if (btnum == XFS_BTNUM_CNT)
+               cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
+
+       cur->bc_private.a.agbp = agbp;
+       cur->bc_private.a.agno = agno;
+
+       return cur;
+}
+
+/*
+ * Calculate number of records in an alloc btree block.
+ */
+int
+xfs_allocbt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_alloc_rec_t);
+       return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
 }
index 5bd1a2c8bd0712a4ba05828802b8d7fb763727e2..a6caa0022c9bba4f937abaf032ede0761e930a22 100644 (file)
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore {
 
 /* btree pointer type */
 typedef __be32 xfs_alloc_ptr_t;
-/* btree block header type */
-typedef        struct xfs_btree_sblock xfs_alloc_block_t;
-
-#define        XFS_BUF_TO_ALLOC_BLOCK(bp)      ((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define        XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
-#define        XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
 
 /*
  * Minimum and maximum blocksize and sectorsize.
@@ -83,73 +72,39 @@ typedef     struct xfs_btree_sblock xfs_alloc_block_t;
 #define        XFS_CNT_BLOCK(mp)       ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define        XFS_ALLOC_REC_ADDR(bb,i,cur)    \
-       XFS_BTREE_REC_ADDR(xfs_alloc, bb, i)
-
-#define        XFS_ALLOC_KEY_ADDR(bb,i,cur)    \
-       XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i)
-
-#define        XFS_ALLOC_PTR_ADDR(bb,i,cur)    \
-       XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur,        xfs_agblock_t *bno,
-                               xfs_extlen_t *len, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len, int *stat);
+#define XFS_ALLOC_BLOCK_LEN(mp)        XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno,
-                               xfs_extlen_t len);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+       ((xfs_alloc_rec_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+       ((xfs_alloc_key_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_alloc_ptr_t *) \
+               ((char *)(block) + \
+                XFS_ALLOC_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_alloc_key_t) + \
+                ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_buf *,
+               xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
 #endif /* __XFS_ALLOC_BTREE_H__ */
index 0b3b5efe848cbbf883373a971cf5b34d1c9951e7..53d5e70d13609a720127319da3d43d6e4c42dc0c 100644 (file)
 #endif
 
 #ifdef XFS_NATIVE_HOST
-#define cpu_to_be16(val)       ((__be16)(val))
-#define cpu_to_be32(val)       ((__be32)(val))
-#define cpu_to_be64(val)       ((__be64)(val))
-#define be16_to_cpu(val)       ((__uint16_t)(val))
-#define be32_to_cpu(val)       ((__uint32_t)(val))
-#define be64_to_cpu(val)       ((__uint64_t)(val))
+#define cpu_to_be16(val)       ((__force __be16)(__u16)(val))
+#define cpu_to_be32(val)       ((__force __be32)(__u32)(val))
+#define cpu_to_be64(val)       ((__force __be64)(__u64)(val))
+#define be16_to_cpu(val)       ((__force __u16)(__be16)(val))
+#define be32_to_cpu(val)       ((__force __u32)(__be32)(val))
+#define be64_to_cpu(val)       ((__force __u64)(__be64)(val))
 #else
-#define cpu_to_be16(val)       (__swab16((__uint16_t)(val)))
-#define cpu_to_be32(val)       (__swab32((__uint32_t)(val)))
-#define cpu_to_be64(val)       (__swab64((__uint64_t)(val)))
-#define be16_to_cpu(val)       (__swab16((__be16)(val)))
-#define be32_to_cpu(val)       (__swab32((__be32)(val)))
-#define be64_to_cpu(val)       (__swab64((__be64)(val)))
+#define cpu_to_be16(val)       ((__force __be16)__swab16((__u16)(val)))
+#define cpu_to_be32(val)       ((__force __be32)__swab32((__u32)(val)))
+#define cpu_to_be64(val)       ((__force __be64)__swab64((__u64)(val)))
+#define be16_to_cpu(val)       (__swab16((__force __u16)(__be16)(val)))
+#define be32_to_cpu(val)       (__swab32((__force __u32)(__be32)(val)))
+#define be64_to_cpu(val)       (__swab64((__force __u64)(__be64)(val)))
 #endif
 
+static inline void be16_add_cpu(__be16 *a, __s16 b)
+{
+       *a = cpu_to_be16(be16_to_cpu(*a) + b);
+}
+
+static inline void be32_add_cpu(__be32 *a, __s32 b)
+{
+       *a = cpu_to_be32(be32_to_cpu(*a) + b);
+}
+
+static inline void be64_add_cpu(__be64 *a, __s64 b)
+{
+       *a = cpu_to_be64(be64_to_cpu(*a) + b);
+}
+
 #endif /* __KERNEL__ */
 
 /* do we need conversion? */
index 8e0e463dae2d7c0f55268c46683110a01a42405b..bca7b243c31979ff488de535c38f262fa5739ccb 100644 (file)
@@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v)
 /* Get low bit set out of 32-bit argument, -1 if none set */
 static inline int xfs_lowbit32(__uint32_t v)
 {
-       unsigned long   t = v;
-       return (v) ? find_first_bit(&t, 32) : -1;
+       return ffs(v) - 1;
 }
 
 /* Get low bit set out of 64-bit argument, -1 if none set */
index a1aab9275d5ac727700a00070b89c88e1acb8e42..db289050692f4fe4a95c791d427151b92c92fbdf 100644 (file)
@@ -393,8 +393,8 @@ xfs_bmap_count_leaves(
 
 STATIC void
 xfs_bmap_disk_count_leaves(
-       xfs_extnum_t            idx,
-       xfs_bmbt_block_t        *block,
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *block,
        int                     numrecs,
        int                     *count);
 
@@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves(
  * Bmap internal routines.
  */
 
+STATIC int                             /* error */
+xfs_bmbt_lookup_eq(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.b.br_startoff = off;
+       cur->bc_rec.b.br_startblock = bno;
+       cur->bc_rec.b.br_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int                             /* error */
+xfs_bmbt_lookup_ge(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.b.br_startoff = off;
+       cur->bc_rec.b.br_startblock = bno;
+       cur->bc_rec.b.br_blockcount = len;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+* Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+       struct xfs_btree_cur    *cur,
+       xfs_fileoff_t           off,
+       xfs_fsblock_t           bno,
+       xfs_filblks_t           len,
+       xfs_exntst_t            state)
+{
+       union xfs_btree_rec     rec;
+
+       xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+       return xfs_btree_update(cur, &rec);
+}
+
 /*
  * Called from xfs_bmap_add_attrfork to handle btree format files.
  */
@@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree(
        if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
                *flags |= XFS_ILOG_DBROOT;
        else {
-               cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-                       XFS_DATA_FORK);
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
                cur->bc_private.b.flist = flist;
                cur->bc_private.b.firstblock = *firstblock;
                if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
                        goto error0;
                /* must be at least one entry */
                XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
-               if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+               if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
                        goto error0;
                if (stat == 0) {
                        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real(
                                        RIGHT.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real(
                                        RIGHT.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real(
                                        &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real(
                                        RIGHT.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
@@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real(
                                PREV.br_blockcount - new->br_blockcount,
                                oldext)))
                                goto done;
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        if (xfs_bmbt_update(cur, LEFT.br_startoff,
                                LEFT.br_startblock,
@@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real(
                                oldext)))
                                goto done;
                        cur->bc_rec.b = *new;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real(
                                PREV.br_blockcount - new->br_blockcount,
                                oldext)))
                                goto done;
-                       if ((error = xfs_bmbt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto done;
                        if ((error = xfs_bmbt_update(cur, new->br_startoff,
                                new->br_startblock,
@@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = XFS_EXT_NORM;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real(
                        cur->bc_rec.b = PREV;
                        cur->bc_rec.b.br_blockcount =
                                new->br_startoff - PREV.br_startoff;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        /*
@@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real(
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        /* new middle extent - newext */
                        cur->bc_rec.b.br_state = new->br_state;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real(
                                        right.br_blockcount, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_delete(cur, &i)))
+                       if ((error = xfs_btree_delete(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
-                       if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+                       if ((error = xfs_btree_decrement(cur, 0, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                        if ((error = xfs_bmbt_update(cur, left.br_startoff,
@@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real(
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 0, done);
                        cur->bc_rec.b.br_state = new->br_state;
-                       if ((error = xfs_bmbt_insert(cur, &i)))
+                       if ((error = xfs_btree_insert(cur, &i)))
                                goto done;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                }
@@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents(
        int                     whichfork)  /* data or attr fork */
 {
        /* REFERENCED */
-       xfs_bmbt_block_t        *cblock;/* child btree block */
+       struct xfs_btree_block  *cblock;/* child btree block */
        xfs_fsblock_t           cbno;   /* child block number */
        xfs_buf_t               *cbp;   /* child block's buffer */
        int                     error;  /* error return value */
        xfs_ifork_t             *ifp;   /* inode fork data */
        xfs_mount_t             *mp;    /* mount point structure */
        __be64                  *pp;    /* ptr to block address */
-       xfs_bmbt_block_t        *rblock;/* root btree block */
+       struct xfs_btree_block  *rblock;/* root btree block */
 
+       mp = ip->i_mount;
        ifp = XFS_IFORK_PTR(ip, whichfork);
        ASSERT(ifp->if_flags & XFS_IFEXTENTS);
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
        rblock = ifp->if_broot;
        ASSERT(be16_to_cpu(rblock->bb_level) == 1);
        ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
-       ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
-       mp = ip->i_mount;
-       pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+       ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
        cbno = be64_to_cpu(*pp);
        *logflagsp = 0;
 #ifdef DEBUG
@@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents(
        if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
                        XFS_BMAP_BTREE_REF)))
                return error;
-       cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
-       if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+       cblock = XFS_BUF_TO_BLOCK(cbp);
+       if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
                return error;
        xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
        ip->i_d.di_nblocks--;
@@ -3170,7 +3216,7 @@ xfs_bmap_del_extent(
                        flags |= XFS_ILOG_FEXT(whichfork);
                        break;
                }
-               if ((error = xfs_bmbt_delete(cur, &i)))
+               if ((error = xfs_btree_delete(cur, &i)))
                        goto done;
                XFS_WANT_CORRUPTED_GOTO(i == 1, done);
                break;
@@ -3254,10 +3300,10 @@ xfs_bmap_del_extent(
                                                got.br_startblock, temp,
                                                got.br_state)))
                                        goto done;
-                               if ((error = xfs_bmbt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto done;
                                cur->bc_rec.b = new;
-                               error = xfs_bmbt_insert(cur, &i);
+                               error = xfs_btree_insert(cur, &i);
                                if (error && error != ENOSPC)
                                        goto done;
                                /*
@@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree(
        int                     *logflagsp,     /* inode logging flags */
        int                     whichfork)      /* data or attr fork */
 {
-       xfs_bmbt_block_t        *ablock;        /* allocated (child) bt block */
+       struct xfs_btree_block  *ablock;        /* allocated (child) bt block */
        xfs_buf_t               *abp;           /* buffer for ablock */
        xfs_alloc_arg_t         args;           /* allocation arguments */
        xfs_bmbt_rec_t          *arp;           /* child record pointer */
-       xfs_bmbt_block_t        *block;         /* btree root block */
+       struct xfs_btree_block  *block;         /* btree root block */
        xfs_btree_cur_t         *cur;           /* bmap btree cursor */
        xfs_bmbt_rec_host_t     *ep;            /* extent record pointer */
        int                     error;          /* error return value */
@@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree(
         */
        xfs_iroot_realloc(ip, 1, whichfork);
        ifp->if_flags |= XFS_IFBROOT;
+
        /*
         * Fill in the root.
         */
@@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree(
        block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
        block->bb_level = cpu_to_be16(1);
        block->bb_numrecs = cpu_to_be16(1);
-       block->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-       block->bb_rightsib = cpu_to_be64(NULLDFSBNO);
+       block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+       block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+
        /*
         * Need a cursor.  Can't allocate until bb_level is filled in.
         */
        mp = ip->i_mount;
-       cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-               whichfork);
+       cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
        cur->bc_private.b.firstblock = *firstblock;
        cur->bc_private.b.flist = flist;
        cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
@@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree(
        /*
         * Fill in the child block.
         */
-       ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+       ablock = XFS_BUF_TO_BLOCK(abp);
        ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
        ablock->bb_level = 0;
-       ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-       ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
-       arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+       ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+       ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
        nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
        for (cnt = i = 0; i < nextents; i++) {
                ep = xfs_iext_get_ext(ifp, i);
@@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree(
                }
        }
        ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
-       ablock->bb_numrecs = cpu_to_be16(cnt);
+       xfs_btree_set_numrecs(ablock, cnt);
+
        /*
         * Fill in the root key and pointer.
         */
-       kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-       arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+       kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+       arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
        kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
-       pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+       pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+                                               be16_to_cpu(block->bb_level)));
        *pp = cpu_to_be64(args.fsbno);
+
        /*
         * Do all this logging at the end so that
         * the root is at the right level.
         */
-       xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
-       xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+       xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+       xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
        ASSERT(*curp == NULL);
        *curp = cur;
        *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
@@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels(
                maxleafents = MAXAEXTNUM;
                sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
        }
-       maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+       maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
        minleafrecs = mp->m_bmap_dmnr[0];
        minnoderecs = mp->m_bmap_dmnr[1];
        maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -4474,6 +4524,22 @@ xfs_bmap_one_block(
        return rval;
 }
 
+STATIC int
+xfs_bmap_sanity_check(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp,
+       int                     level)
+{
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+
+       if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
+           be16_to_cpu(block->bb_level) != level ||
+           be16_to_cpu(block->bb_numrecs) == 0 ||
+           be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+               return 0;
+       return 1;
+}
+
 /*
  * Read in the extents to if_extents.
  * All inode fields are set up by caller, we just traverse the btree
@@ -4486,7 +4552,7 @@ xfs_bmap_read_extents(
        xfs_inode_t             *ip,    /* incore inode */
        int                     whichfork) /* data or attr fork */
 {
-       xfs_bmbt_block_t        *block; /* current btree block */
+       struct xfs_btree_block  *block; /* current btree block */
        xfs_fsblock_t           bno;    /* block # of "block" */
        xfs_buf_t               *bp;    /* buffer for "block" */
        int                     error;  /* error return value */
@@ -4510,7 +4576,7 @@ xfs_bmap_read_extents(
         */
        level = be16_to_cpu(block->bb_level);
        ASSERT(level > 0);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
        bno = be64_to_cpu(*pp);
        ASSERT(bno != NULLDFSBNO);
        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -4523,13 +4589,13 @@ xfs_bmap_read_extents(
                if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        return error;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                XFS_WANT_CORRUPTED_GOTO(
-                       XFS_BMAP_SANITY_CHECK(mp, block, level),
+                       xfs_bmap_sanity_check(mp, bp, level),
                        error0);
                if (level == 0)
                        break;
-               pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
                XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
                xfs_trans_brelse(tp, bp);
@@ -4549,7 +4615,7 @@ xfs_bmap_read_extents(
                xfs_extnum_t    start;
 
 
-               num_recs = be16_to_cpu(block->bb_numrecs);
+               num_recs = xfs_btree_get_numrecs(block);
                if (unlikely(i + num_recs > room)) {
                        ASSERT(i + num_recs <= room);
                        xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
@@ -4561,18 +4627,18 @@ xfs_bmap_read_extents(
                        goto error0;
                }
                XFS_WANT_CORRUPTED_GOTO(
-                       XFS_BMAP_SANITY_CHECK(mp, block, 0),
+                       xfs_bmap_sanity_check(mp, bp, 0),
                        error0);
                /*
                 * Read-ahead the next leaf block, if any.
                 */
-               nextbno = be64_to_cpu(block->bb_rightsib);
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                if (nextbno != NULLFSBLOCK)
                        xfs_btree_reada_bufl(mp, nextbno, 1);
                /*
                 * Copy records into the extent records.
                 */
-               frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+               frp = XFS_BMBT_REC_ADDR(mp, block, 1);
                start = i;
                for (j = 0; j < num_recs; j++, i++, frp++) {
                        xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
@@ -4603,7 +4669,7 @@ xfs_bmap_read_extents(
                if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        return error;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
        }
        ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
        ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
@@ -5029,8 +5095,7 @@ xfs_bmapi(
                                if (abno == NULLFSBLOCK)
                                        break;
                                if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-                                       cur = xfs_btree_init_cursor(mp,
-                                               tp, NULL, 0, XFS_BTNUM_BMAP,
+                                       cur = xfs_bmbt_init_cursor(mp, tp,
                                                ip, whichfork);
                                        cur->bc_private.b.firstblock =
                                                *firstblock;
@@ -5147,9 +5212,8 @@ xfs_bmapi(
                         */
                        ASSERT(mval->br_blockcount <= len);
                        if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
-                               cur = xfs_btree_init_cursor(mp,
-                                       tp, NULL, 0, XFS_BTNUM_BMAP,
-                                       ip, whichfork);
+                               cur = xfs_bmbt_init_cursor(mp,
+                                       tp, ip, whichfork);
                                cur->bc_private.b.firstblock =
                                        *firstblock;
                                cur->bc_private.b.flist = flist;
@@ -5440,8 +5504,7 @@ xfs_bunmapi(
        logflags = 0;
        if (ifp->if_flags & XFS_IFBROOT) {
                ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
-               cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
-                       whichfork);
+               cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
                cur->bc_private.b.firstblock = *firstblock;
                cur->bc_private.b.flist = flist;
                cur->bc_private.b.flags = 0;
@@ -6131,7 +6194,7 @@ xfs_bmap_get_bp(
 
 void
 xfs_check_block(
-       xfs_bmbt_block_t        *block,
+       struct xfs_btree_block  *block,
        xfs_mount_t             *mp,
        int                     root,
        short                   sz)
@@ -6143,36 +6206,29 @@ xfs_check_block(
        ASSERT(be16_to_cpu(block->bb_level) > 0);
 
        prevp = NULL;
-       for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) {
+       for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
                dmxr = mp->m_bmap_dmxr[0];
-
-               if (root) {
-                       keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
-               } else {
-                       keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i);
-               }
+               keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
 
                if (prevp) {
-                       xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
+                       ASSERT(be64_to_cpu(prevp->br_startoff) <
+                              be64_to_cpu(keyp->br_startoff));
                }
                prevp = keyp;
 
                /*
                 * Compare the block numbers to see if there are dups.
                 */
+               if (root)
+                       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+               else
+                       pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
 
-               if (root) {
-                       pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
-               } else {
-                       pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr);
-               }
                for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
-                       if (root) {
-                               thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
-                       } else {
-                               thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j,
-                                                           dmxr);
-                       }
+                       if (root)
+                               thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+                       else
+                               thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
                        if (*thispa == *pp) {
                                cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
                                        __func__, j, i,
@@ -6195,7 +6251,7 @@ xfs_bmap_check_leaf_extents(
        xfs_inode_t             *ip,            /* incore inode pointer */
        int                     whichfork)      /* data or attr fork */
 {
-       xfs_bmbt_block_t        *block; /* current btree block */
+       struct xfs_btree_block  *block; /* current btree block */
        xfs_fsblock_t           bno;    /* block # of "block" */
        xfs_buf_t               *bp;    /* buffer for "block" */
        int                     error;  /* error return value */
@@ -6223,7 +6279,7 @@ xfs_bmap_check_leaf_extents(
        level = be16_to_cpu(block->bb_level);
        ASSERT(level > 0);
        xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
        bno = be64_to_cpu(*pp);
 
        ASSERT(bno != NULLDFSBNO);
@@ -6245,9 +6301,9 @@ xfs_bmap_check_leaf_extents(
                if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        goto error_norelse;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                XFS_WANT_CORRUPTED_GOTO(
-                       XFS_BMAP_SANITY_CHECK(mp, block, level),
+                       xfs_bmap_sanity_check(mp, bp, level),
                        error0);
                if (level == 0)
                        break;
@@ -6258,7 +6314,7 @@ xfs_bmap_check_leaf_extents(
                 */
 
                xfs_check_block(block, mp, 0, 0);
-               pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
                XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
                if (bp_release) {
@@ -6280,13 +6336,13 @@ xfs_bmap_check_leaf_extents(
                xfs_extnum_t    num_recs;
 
 
-               num_recs = be16_to_cpu(block->bb_numrecs);
+               num_recs = xfs_btree_get_numrecs(block);
 
                /*
                 * Read-ahead the next leaf block, if any.
                 */
 
-               nextbno = be64_to_cpu(block->bb_rightsib);
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
                /*
                 * Check all the extents to make sure they are OK.
@@ -6294,13 +6350,17 @@ xfs_bmap_check_leaf_extents(
                 * conform with the first entry in this one.
                 */
 
-               ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+               ep = XFS_BMBT_REC_ADDR(mp, block, 1);
                if (i) {
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
+                       ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+                              xfs_bmbt_disk_get_blockcount(&last) <=
+                              xfs_bmbt_disk_get_startoff(ep));
                }
                for (j = 1; j < num_recs; j++) {
-                       nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
+                       nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+                       ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+                              xfs_bmbt_disk_get_blockcount(ep) <=
+                              xfs_bmbt_disk_get_startoff(nextp));
                        ep = nextp;
                }
 
@@ -6326,7 +6386,7 @@ xfs_bmap_check_leaf_extents(
                if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
                                XFS_BMAP_BTREE_REF)))
                        goto error_norelse;
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
        }
        if (bp_release) {
                bp_release = 0;
@@ -6356,7 +6416,7 @@ xfs_bmap_count_blocks(
        int                     whichfork,      /* data or attr fork */
        int                     *count)         /* out: count of blocks */
 {
-       xfs_bmbt_block_t        *block; /* current btree block */
+       struct xfs_btree_block  *block; /* current btree block */
        xfs_fsblock_t           bno;    /* block # of "block" */
        xfs_ifork_t             *ifp;   /* fork structure */
        int                     level;  /* btree level, for checking */
@@ -6379,7 +6439,7 @@ xfs_bmap_count_blocks(
        block = ifp->if_broot;
        level = be16_to_cpu(block->bb_level);
        ASSERT(level > 0);
-       pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+       pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
        bno = be64_to_cpu(*pp);
        ASSERT(bno != NULLDFSBNO);
        ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -6413,29 +6473,29 @@ xfs_bmap_count_tree(
        __be64                  *pp;
        xfs_fsblock_t           bno = blockno;
        xfs_fsblock_t           nextbno;
-       xfs_bmbt_block_t        *block, *nextblock;
+       struct xfs_btree_block  *block, *nextblock;
        int                     numrecs;
 
        if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
                return error;
        *count += 1;
-       block = XFS_BUF_TO_BMBT_BLOCK(bp);
+       block = XFS_BUF_TO_BLOCK(bp);
 
        if (--level) {
                /* Not at node above leafs, count this level of nodes */
-               nextbno = be64_to_cpu(block->bb_rightsib);
+               nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                while (nextbno != NULLFSBLOCK) {
                        if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
                                0, &nbp, XFS_BMAP_BTREE_REF)))
                                return error;
                        *count += 1;
-                       nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
-                       nextbno = be64_to_cpu(nextblock->bb_rightsib);
+                       nextblock = XFS_BUF_TO_BLOCK(nbp);
+                       nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
                        xfs_trans_brelse(tp, nbp);
                }
 
                /* Dive to the next level */
-               pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+               pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
                bno = be64_to_cpu(*pp);
                if (unlikely((error =
                     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
@@ -6448,9 +6508,9 @@ xfs_bmap_count_tree(
        } else {
                /* count all level 1 nodes and their leaves */
                for (;;) {
-                       nextbno = be64_to_cpu(block->bb_rightsib);
+                       nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                        numrecs = be16_to_cpu(block->bb_numrecs);
-                       xfs_bmap_disk_count_leaves(0, block, numrecs, count);
+                       xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
                        xfs_trans_brelse(tp, bp);
                        if (nextbno == NULLFSBLOCK)
                                break;
@@ -6459,7 +6519,7 @@ xfs_bmap_count_tree(
                                XFS_BMAP_BTREE_REF)))
                                return error;
                        *count += 1;
-                       block = XFS_BUF_TO_BMBT_BLOCK(bp);
+                       block = XFS_BUF_TO_BLOCK(bp);
                }
        }
        return 0;
@@ -6489,8 +6549,8 @@ xfs_bmap_count_leaves(
  */
 STATIC void
 xfs_bmap_disk_count_leaves(
-       xfs_extnum_t            idx,
-       xfs_bmbt_block_t        *block,
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *block,
        int                     numrecs,
        int                     *count)
 {
@@ -6498,7 +6558,7 @@ xfs_bmap_disk_count_leaves(
        xfs_bmbt_rec_t  *frp;
 
        for (b = 1; b <= numrecs; b++) {
-               frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
+               frp = XFS_BMBT_REC_ADDR(mp, block, b);
                *count += xfs_bmbt_disk_get_blockcount(frp);
        }
 }
index 9f3e3a836d153b6bbf09d222588b84ae06c1be34..7c9d12cd7a47bbd28db102e238b15d2c1ac547af 100644 (file)
@@ -137,9 +137,7 @@ typedef struct xfs_bmalloca {
        char                    conv;   /* overwriting unwritten extents */
 } xfs_bmalloca_t;
 
-#ifdef __KERNEL__
-
-#if defined(XFS_BMAP_TRACE)
+#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
 /*
  * Trace operations for bmap extent tracing
  */
@@ -163,9 +161,12 @@ xfs_bmap_trace_exlist(
        int                     whichfork);     /* data or attr fork */
 #define        XFS_BMAP_TRACE_EXLIST(ip,c,w)   \
        xfs_bmap_trace_exlist(__func__,ip,c,w)
-#else
+
+#else  /* __KERNEL__ && XFS_BMAP_TRACE */
+
 #define        XFS_BMAP_TRACE_EXLIST(ip,c,w)
-#endif
+
+#endif /* __KERNEL__ && XFS_BMAP_TRACE */
 
 /*
  * Convert inode from non-attributed to attributed.
@@ -205,20 +206,6 @@ xfs_bmap_compute_maxlevels(
        struct xfs_mount        *mp,    /* file system mount structure */
        int                     whichfork);     /* data or attr fork */
 
-/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller.  Frees all the extents that need freeing, which must be done
- * last due to locking considerations.
- *
- * Return 1 if the given transaction was committed and a new one allocated,
- * and 0 otherwise.
- */
-int                                            /* error */
-xfs_bmap_finish(
-       struct xfs_trans        **tp,           /* transaction pointer addr */
-       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
-       int                     *committed);    /* xact committed or not */
-
 /*
  * Returns the file-relative block number of the first unused block in the file.
  * This is the lowest-address hole if the file has holes, else the first block
@@ -343,6 +330,32 @@ xfs_bunmapi(
                                                   extents */
        int                     *done);         /* set if not done yet */
 
+/*
+ * Check an extent list, which has just been read, for
+ * any bit in the extent flag field.
+ */
+int
+xfs_check_nostate_extents(
+       struct xfs_ifork        *ifp,
+       xfs_extnum_t            idx,
+       xfs_extnum_t            num);
+
+#ifdef __KERNEL__
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller.  Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.
+ *
+ * Return 1 if the given transaction was committed and a new one allocated,
+ * and 0 otherwise.
+ */
+int                                            /* error */
+xfs_bmap_finish(
+       struct xfs_trans        **tp,           /* transaction pointer addr */
+       xfs_bmap_free_t         *flist,         /* i/o: list extents to free */
+       int                     *committed);    /* xact committed or not */
+
 /*
  * Fcntl interface to xfs_bmapi.
  */
@@ -374,16 +387,6 @@ xfs_bmap_count_blocks(
        int                     whichfork,
        int                     *count);
 
-/*
- * Check an extent list, which has just been read, for
- * any bit in the extent flag field.
- */
-int
-xfs_check_nostate_extents(
-       struct xfs_ifork        *ifp,
-       xfs_extnum_t            idx,
-       xfs_extnum_t            num);
-
 /*
  * Search the extent records for the entry containing block bno.
  * If bno lies in a hole, point to the next entry.  If bno lies
index 23efad29a5cd680b1229ae32a3e56fa9b78d50a8..e46e02b8e27712b7918bc935e41a0a6394e95adf 100644 (file)
 #include "xfs_inode_item.h"
 #include "xfs_alloc.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_itable.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
 
-#if defined(XFS_BMBT_TRACE)
-ktrace_t       *xfs_bmbt_trace_buf;
-#endif
-
-/*
- * Prototypes for internal btree functions.
- */
-
-
-STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
-STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
-               __uint64_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
-
-
-#if defined(XFS_BMBT_TRACE)
-
-static char    ARGS[] = "args";
-static char    ENTRY[] = "entry";
-static char    ERROR[] = "error";
-#undef EXIT
-static char    EXIT[] = "exit";
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-STATIC void
-xfs_bmbt_trace_enter(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       char            *s,
-       int             type,
-       int             line,
-       __psunsigned_t  a0,
-       __psunsigned_t  a1,
-       __psunsigned_t  a2,
-       __psunsigned_t  a3,
-       __psunsigned_t  a4,
-       __psunsigned_t  a5,
-       __psunsigned_t  a6,
-       __psunsigned_t  a7,
-       __psunsigned_t  a8,
-       __psunsigned_t  a9,
-       __psunsigned_t  a10)
-{
-       xfs_inode_t     *ip;
-       int             whichfork;
-
-       ip = cur->bc_private.b.ip;
-       whichfork = cur->bc_private.b.whichfork;
-       ktrace_enter(xfs_bmbt_trace_buf,
-               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
-               (void *)func, (void *)s, (void *)ip, (void *)cur,
-               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
-               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
-               (void *)a8, (void *)a9, (void *)a10);
-       ASSERT(ip->i_btrace);
-       ktrace_enter(ip->i_btrace,
-               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
-               (void *)func, (void *)s, (void *)ip, (void *)cur,
-               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
-               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
-               (void *)a8, (void *)a9, (void *)a10);
-}
-/*
- * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argbi(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *b,
-       int             i,
-       int             line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
-               (__psunsigned_t)b, i, 0, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
- */
-STATIC void
-xfs_bmbt_trace_argbii(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *b,
-       int             i0,
-       int             i1,
-       int             line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
-               (__psunsigned_t)b, i0, i1, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for 3 block-length args
- * and an integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argfffi(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       xfs_dfiloff_t           o,
-       xfs_dfsbno_t            b,
-       xfs_dfilblks_t          i,
-       int                     j,
-       int                     line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
-               o >> 32, (int)o, b >> 32, (int)b,
-               i >> 32, (int)i, (int)j, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for one integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argi(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       int             i,
-       int             line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
-               i, 0, 0, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, key.
- */
-STATIC void
-xfs_bmbt_trace_argifk(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       int                     i,
-       xfs_fsblock_t           f,
-       xfs_dfiloff_t           o,
-       int                     line)
-{
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
-               i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
-               (int)o, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, rec.
- */
-STATIC void
-xfs_bmbt_trace_argifr(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       int                     i,
-       xfs_fsblock_t           f,
-       xfs_bmbt_rec_t          *r,
-       int                     line)
-{
-       xfs_dfsbno_t            b;
-       xfs_dfilblks_t          c;
-       xfs_dfsbno_t            d;
-       xfs_dfiloff_t           o;
-       xfs_bmbt_irec_t         s;
-
-       d = (xfs_dfsbno_t)f;
-       xfs_bmbt_disk_get_all(r, &s);
-       o = (xfs_dfiloff_t)s.br_startoff;
-       b = (xfs_dfsbno_t)s.br_startblock;
-       c = s.br_blockcount;
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
-               i, d >> 32, (int)d, o >> 32,
-               (int)o, b >> 32, (int)b, c >> 32,
-               (int)c, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, key.
- */
-STATIC void
-xfs_bmbt_trace_argik(
-       const char              *func,
-       xfs_btree_cur_t         *cur,
-       int                     i,
-       xfs_bmbt_key_t          *k,
-       int                     line)
-{
-       xfs_dfiloff_t           o;
-
-       o = be64_to_cpu(k->br_startoff);
-       xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
-               i, o >> 32, (int)o, 0,
-               0, 0, 0, 0,
-               0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for the cursor/operation.
- */
-STATIC void
-xfs_bmbt_trace_cursor(
-       const char      *func,
-       xfs_btree_cur_t *cur,
-       char            *s,
-       int             line)
-{
-       xfs_bmbt_rec_host_t     r;
-
-       xfs_bmbt_set_all(&r, &cur->bc_rec.b);
-       xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
-               (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
-               cur->bc_private.b.allocated,
-               r.l0 >> 32, (int)r.l0,
-               r.l1 >> 32, (int)r.l1,
-               (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
-               (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
-               (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
-               (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
-}
-
-#define        XFS_BMBT_TRACE_ARGBI(c,b,i)     \
-       xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
-#define        XFS_BMBT_TRACE_ARGBII(c,b,i,j)  \
-       xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
-#define        XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)       \
-       xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
-#define        XFS_BMBT_TRACE_ARGI(c,i)        \
-       xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
-#define        XFS_BMBT_TRACE_ARGIFK(c,i,f,s)  \
-       xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
-#define        XFS_BMBT_TRACE_ARGIFR(c,i,f,r)  \
-       xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
-#define        XFS_BMBT_TRACE_ARGIK(c,i,k)     \
-       xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
-#define        XFS_BMBT_TRACE_CURSOR(c,s)      \
-       xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
-#else
-#define        XFS_BMBT_TRACE_ARGBI(c,b,i)
-#define        XFS_BMBT_TRACE_ARGBII(c,b,i,j)
-#define        XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
-#define        XFS_BMBT_TRACE_ARGI(c,i)
-#define        XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
-#define        XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
-#define        XFS_BMBT_TRACE_ARGIK(c,i,k)
-#define        XFS_BMBT_TRACE_CURSOR(c,s)
-#endif /* XFS_BMBT_TRACE */
-
-
-/*
- * Internal functions.
- */
-
-/*
- * Delete record pointed to by cur/level.
- */
-STATIC int                                     /* error */
-xfs_bmbt_delrec(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block;         /* bmap btree block */
-       xfs_fsblock_t           bno;            /* fs-relative block number */
-       xfs_buf_t               *bp;            /* buffer for block */
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-       int                     j;              /* temp state */
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_bmbt_key_t          *kp=NULL;       /* pointer to bmap btree key */
-       xfs_fsblock_t           lbno;           /* left sibling block number */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp;           /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       int                     lrecs=0;        /* left record count */
-       xfs_bmbt_rec_t          *lrp;           /* left record pointer */
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_bmbt_ptr_t          *pp;            /* pointer to bmap block addr */
-       int                     ptr;            /* key/record index */
-       xfs_fsblock_t           rbno;           /* right sibling block number */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp;           /* right btree key */
-       xfs_bmbt_rec_t          *rp;            /* pointer to bmap btree rec */
-       xfs_bmbt_ptr_t          *rpp;           /* right address pointer */
-       xfs_bmbt_block_t        *rrblock;       /* right-right btree block */
-       xfs_buf_t               *rrbp;          /* right-right buffer pointer */
-       int                     rrecs=0;        /* right record count */
-       xfs_bmbt_rec_t          *rrp;           /* right record pointer */
-       xfs_btree_cur_t         *tcur;          /* temporary btree cursor */
-       int                     numrecs;        /* temporary numrec count */
-       int                     numlrecs, numrrecs;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       ptr = cur->bc_ptrs[level];
-       tcur = NULL;
-       if (ptr == 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       block = xfs_bmbt_get_block(cur, level, &bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-#endif
-       if (ptr > numrecs) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_bmbt_delrec);
-       if (level > 0) {
-               kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-               pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = ptr; i < numrecs; i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-               }
-#endif
-               if (ptr < numrecs) {
-                       memmove(&kp[ptr - 1], &kp[ptr],
-                               (numrecs - ptr) * sizeof(*kp));
-                       memmove(&pp[ptr - 1], &pp[ptr],
-                               (numrecs - ptr) * sizeof(*pp));
-                       xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
-                       xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
-               }
-       } else {
-               rp = XFS_BMAP_REC_IADDR(block, 1, cur);
-               if (ptr < numrecs) {
-                       memmove(&rp[ptr - 1], &rp[ptr],
-                               (numrecs - ptr) * sizeof(*rp));
-                       xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
-               }
-               if (ptr == 1) {
-                       key.br_startoff =
-                               cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
-                       kp = &key;
-               }
-       }
-       numrecs--;
-       block->bb_numrecs = cpu_to_be16(numrecs);
-       xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
-       /*
-        * We're at the root level.
-        * First, shrink the root block in-memory.
-        * Try to get rid of the next level down.
-        * If we can't then there's nothing left to do.
-        */
-       if (level == cur->bc_nlevels - 1) {
-               xfs_iroot_realloc(cur->bc_private.b.ip, -1,
-                       cur->bc_private.b.whichfork);
-               if ((error = xfs_bmbt_killroot(cur))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-       if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       rbno = be64_to_cpu(block->bb_rightsib);
-       lbno = be64_to_cpu(block->bb_leftsib);
-       /*
-        * One child of root, need to get a chance to copy its contents
-        * into the root and delete it. Can't go up to next level,
-        * there's nothing to delete there.
-        */
-       if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
-           level == cur->bc_nlevels - 2) {
-               if ((error = xfs_bmbt_killroot(cur))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
-       if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-       bno = NULLFSBLOCK;
-       if (rbno != NULLFSBLOCK) {
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_bmbt_increment(tcur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               rbp = tcur->bc_bufs[level];
-               right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-#endif
-               bno = be64_to_cpu(right->bb_leftsib);
-               if (be16_to_cpu(right->bb_numrecs) - 1 >=
-                   XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
-                       if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_BMAP_BLOCK_IMINRECS(level, tcur));
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-                               tcur = NULL;
-                               if (level > 0) {
-                                       if ((error = xfs_bmbt_decrement(cur,
-                                                       level, &i))) {
-                                               XFS_BMBT_TRACE_CURSOR(cur,
-                                                       ERROR);
-                                               goto error0;
-                                       }
-                               }
-                               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if (lbno != NULLFSBLOCK) {
-                       i = xfs_btree_firstrec(tcur, level);
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-                       if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               }
-       }
-       if (lbno != NULLFSBLOCK) {
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * decrement to last in block
-                */
-               if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               i = xfs_btree_firstrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               lbp = tcur->bc_bufs[level];
-               left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-#endif
-               bno = be64_to_cpu(left->bb_rightsib);
-               if (be16_to_cpu(left->bb_numrecs) - 1 >=
-                   XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
-                       if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_BMAP_BLOCK_IMINRECS(level, tcur));
-                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-                               tcur = NULL;
-                               if (level == 0)
-                                       cur->bc_ptrs[0]++;
-                               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       }
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       tcur = NULL;
-       mp = cur->bc_mp;
-       ASSERT(bno != NULLFSBLOCK);
-       if (lbno != NULLFSBLOCK &&
-           lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               rbno = bno;
-               right = block;
-               rbp = bp;
-               if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-               if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-       } else if (rbno != NULLFSBLOCK &&
-                  rrecs + be16_to_cpu(block->bb_numrecs) <=
-                  XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               lbno = bno;
-               left = block;
-               lbp = bp;
-               if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-               if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       } else {
-               if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       numlrecs = be16_to_cpu(left->bb_numrecs);
-       numrrecs = be16_to_cpu(right->bb_numrecs);
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
-               lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < numrrecs; i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               goto error0;
-                       }
-               }
-#endif
-               memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
-               memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
-               xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
-               xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
-               xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
-       }
-       be16_add_cpu(&left->bb_numrecs, numrrecs);
-       left->bb_rightsib = right->bb_rightsib;
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
-       if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
-               if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
-                               be64_to_cpu(left->bb_rightsib),
-                               0, &rrbp, XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       goto error0;
-               }
-               rrblock->bb_leftsib = cpu_to_be64(lbno);
-               xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-       }
-       xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
-               cur->bc_private.b.flist, mp);
-       cur->bc_private.b.ip->i_d.di_nblocks--;
-       xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-       XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
-                       XFS_TRANS_DQ_BCOUNT, -1L);
-       xfs_trans_binval(cur->bc_tp, rbp);
-       if (bp != lbp) {
-               cur->bc_bufs[level] = lbp;
-               cur->bc_ptrs[level] += lrecs;
-               cur->bc_ra[level] = 0;
-       } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               goto error0;
-       }
-       if (level > 0)
-               cur->bc_ptrs[level]--;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 2;
-       return 0;
-
-error0:
-       if (tcur)
-               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int                                     /* error */
-xfs_bmbt_insrec(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       xfs_fsblock_t           *bnop,
-       xfs_bmbt_rec_t          *recp,
-       xfs_btree_cur_t         **curp,
-       int                     *stat)          /* no-go/done/continue */
-{
-       xfs_bmbt_block_t        *block;         /* bmap btree block */
-       xfs_buf_t               *bp;            /* buffer for block */
-       int                     error;          /* error return value */
-       int                     i;              /* loop index */
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_bmbt_key_t          *kp=NULL;       /* pointer to bmap btree key */
-       int                     logflags;       /* inode logging flags */
-       xfs_fsblock_t           nbno;           /* new block number */
-       struct xfs_btree_cur    *ncur;          /* new btree cursor */
-       __uint64_t              startoff;       /* new btree key value */
-       xfs_bmbt_rec_t          nrec;           /* new record count */
-       int                     optr;           /* old key/record index */
-       xfs_bmbt_ptr_t          *pp;            /* pointer to bmap block addr */
-       int                     ptr;            /* key/record index */
-       xfs_bmbt_rec_t          *rp=NULL;       /* pointer to bmap btree rec */
-       int                     numrecs;
-
-       ASSERT(level < cur->bc_nlevels);
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
-       ncur = NULL;
-       key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
-       optr = ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       XFS_STATS_INC(xs_bmbt_insrec);
-       block = xfs_bmbt_get_block(cur, level, &bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (ptr <= numrecs) {
-               if (level == 0) {
-                       rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
-               } else {
-                       kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
-                       xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
-               }
-       }
-#endif
-       nbno = NULLFSBLOCK;
-       if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
-                       /*
-                        * A root block, that can be made bigger.
-                        */
-                       xfs_iroot_realloc(cur->bc_private.b.ip, 1,
-                               cur->bc_private.b.whichfork);
-                       block = xfs_bmbt_get_block(cur, level, &bp);
-               } else if (level == cur->bc_nlevels - 1) {
-                       if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
-                           *stat == 0) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-                       xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
-                               logflags);
-                       block = xfs_bmbt_get_block(cur, level, &bp);
-               } else {
-                       if ((error = xfs_bmbt_rshift(cur, level, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-                       if (i) {
-                               /* nothing */
-                       } else {
-                               if ((error = xfs_bmbt_lshift(cur, level, &i))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                               if (i) {
-                                       optr = ptr = cur->bc_ptrs[level];
-                               } else {
-                                       if ((error = xfs_bmbt_split(cur, level,
-                                                       &nbno, &startoff, &ncur,
-                                                       &i))) {
-                                               XFS_BMBT_TRACE_CURSOR(cur,
-                                                       ERROR);
-                                               return error;
-                                       }
-                                       if (i) {
-                                               block = xfs_bmbt_get_block(
-                                                           cur, level, &bp);
-#ifdef DEBUG
-                                               if ((error =
-                                                   xfs_btree_check_lblock(cur,
-                                                           block, level, bp))) {
-                                                       XFS_BMBT_TRACE_CURSOR(
-                                                               cur, ERROR);
-                                                       return error;
-                                               }
-#endif
-                                               ptr = cur->bc_ptrs[level];
-                                               xfs_bmbt_disk_set_allf(&nrec,
-                                                       startoff, 0, 0,
-                                                       XFS_EXT_NORM);
-                                       } else {
-                                               XFS_BMBT_TRACE_CURSOR(cur,
-                                                       EXIT);
-                                               *stat = 0;
-                                               return 0;
-                                       }
-                               }
-                       }
-               }
-       }
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (level > 0) {
-               kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-               pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = numrecs; i >= ptr; i--) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
-                                       level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memmove(&kp[ptr], &kp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*kp));
-               memmove(&pp[ptr], &pp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               kp[ptr - 1] = key;
-               pp[ptr - 1] = cpu_to_be64(*bnop);
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
-               xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
-       } else {
-               rp = XFS_BMAP_REC_IADDR(block, 1, cur);
-               memmove(&rp[ptr], &rp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*rp));
-               rp[ptr - 1] = *recp;
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
-       }
-       xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       if (ptr < numrecs) {
-               if (level == 0)
-                       xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
-                               rp + ptr);
-               else
-                       xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
-                               kp + ptr);
-       }
-#endif
-       if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       *bnop = nbno;
-       if (nbno != NULLFSBLOCK) {
-               *recp = nrec;
-               *curp = ncur;
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-STATIC int
-xfs_bmbt_killroot(
-       xfs_btree_cur_t         *cur)
-{
-       xfs_bmbt_block_t        *block;
-       xfs_bmbt_block_t        *cblock;
-       xfs_buf_t               *cbp;
-       xfs_bmbt_key_t          *ckp;
-       xfs_bmbt_ptr_t          *cpp;
-#ifdef DEBUG
-       int                     error;
-#endif
-       int                     i;
-       xfs_bmbt_key_t          *kp;
-       xfs_inode_t             *ip;
-       xfs_ifork_t             *ifp;
-       int                     level;
-       xfs_bmbt_ptr_t          *pp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       level = cur->bc_nlevels - 1;
-       ASSERT(level >= 1);
-       /*
-        * Don't deal with the root block needs to be a leaf case.
-        * We're just going to turn the thing back into extents anyway.
-        */
-       if (level == 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       block = xfs_bmbt_get_block(cur, level, &cbp);
-       /*
-        * Give up if the root has multiple children.
-        */
-       if (be16_to_cpu(block->bb_numrecs) != 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       /*
-        * Only do this if the next level will fit.
-        * Then the data must be copied up to the inode,
-        * instead of freeing the root you free the next level.
-        */
-       cbp = cur->bc_bufs[level - 1];
-       cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
-       if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
-       ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
-       ip = cur->bc_private.b.ip;
-       ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
-       ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
-              XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
-       i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
-       if (i) {
-               xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
-               block = ifp->if_broot;
-       }
-       be16_add_cpu(&block->bb_numrecs, i);
-       ASSERT(block->bb_numrecs == cblock->bb_numrecs);
-       kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-       ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
-       memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
-       pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-       cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
-       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-               if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-       }
-#endif
-       memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
-       xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
-                       cur->bc_private.b.flist, cur->bc_mp);
-       ip->i_d.di_nblocks--;
-       XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
-                       XFS_TRANS_DQ_BCOUNT, -1L);
-       xfs_trans_binval(cur->bc_tp, cbp);
-       cur->bc_bufs[level - 1] = NULL;
-       be16_add_cpu(&block->bb_level, -1);
-       xfs_trans_log_inode(cur->bc_tp, ip,
-               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       cur->bc_nlevels--;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       return 0;
-}
-
-/*
- * Log key values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_keys(
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *bp,
-       int             kfirst,
-       int             klast)
-{
-       xfs_trans_t     *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
-       tp = cur->bc_tp;
-       if (bp) {
-               xfs_bmbt_block_t        *block;
-               int                     first;
-               xfs_bmbt_key_t          *kp;
-               int                     last;
-
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
-               first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
-               last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
-               xfs_trans_log_buf(tp, bp, first, last);
-       } else {
-               xfs_inode_t              *ip;
-
-               ip = cur->bc_private.b.ip;
-               xfs_trans_log_inode(tp, ip,
-                       XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Log pointer values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_ptrs(
-       xfs_btree_cur_t *cur,
-       xfs_buf_t       *bp,
-       int             pfirst,
-       int             plast)
-{
-       xfs_trans_t     *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
-       tp = cur->bc_tp;
-       if (bp) {
-               xfs_bmbt_block_t        *block;
-               int                     first;
-               int                     last;
-               xfs_bmbt_ptr_t          *pp;
-
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
-               first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
-               last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
-               xfs_trans_log_buf(tp, bp, first, last);
-       } else {
-               xfs_inode_t             *ip;
-
-               ip = cur->bc_private.b.ip;
-               xfs_trans_log_inode(tp, ip,
-                       XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- */
-STATIC int                             /* error */
-xfs_bmbt_lookup(
-       xfs_btree_cur_t         *cur,
-       xfs_lookup_t            dir,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block=NULL;
-       xfs_buf_t               *bp;
-       xfs_daddr_t             d;
-       xfs_sfiloff_t           diff;
-       int                     error;          /* error return value */
-       xfs_fsblock_t           fsbno=0;
-       int                     high;
-       int                     i;
-       int                     keyno=0;
-       xfs_bmbt_key_t          *kkbase=NULL;
-       xfs_bmbt_key_t          *kkp;
-       xfs_bmbt_rec_t          *krbase=NULL;
-       xfs_bmbt_rec_t          *krp;
-       int                     level;
-       int                     low;
-       xfs_mount_t             *mp;
-       xfs_bmbt_ptr_t          *pp;
-       xfs_bmbt_irec_t         *rp;
-       xfs_fileoff_t           startoff;
-       xfs_trans_t             *tp;
-
-       XFS_STATS_INC(xs_bmbt_lookup);
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, (int)dir);
-       tp = cur->bc_tp;
-       mp = cur->bc_mp;
-       rp = &cur->bc_rec.b;
-       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-               if (level < cur->bc_nlevels - 1) {
-                       d = XFS_FSB_TO_DADDR(mp, fsbno);
-                       bp = cur->bc_bufs[level];
-                       if (bp && XFS_BUF_ADDR(bp) != d)
-                               bp = NULL;
-                       if (!bp) {
-                               if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
-                                               0, &bp, XFS_BMAP_BTREE_REF))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                               xfs_btree_setbuf(cur, level, bp);
-                               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-                               if ((error = xfs_btree_check_lblock(cur, block,
-                                               level, bp))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                       } else
-                               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               } else
-                       block = xfs_bmbt_get_block(cur, level, &bp);
-               if (diff == 0)
-                       keyno = 1;
-               else {
-                       if (level > 0)
-                               kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
-                       else
-                               krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
-                       low = 1;
-                       if (!(high = be16_to_cpu(block->bb_numrecs))) {
-                               ASSERT(level == 0);
-                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-                               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                               *stat = 0;
-                               return 0;
-                       }
-                       while (low <= high) {
-                               XFS_STATS_INC(xs_bmbt_compare);
-                               keyno = (low + high) >> 1;
-                               if (level > 0) {
-                                       kkp = kkbase + keyno - 1;
-                                       startoff = be64_to_cpu(kkp->br_startoff);
-                               } else {
-                                       krp = krbase + keyno - 1;
-                                       startoff = xfs_bmbt_disk_get_startoff(krp);
-                               }
-                               diff = (xfs_sfiloff_t)
-                                               (startoff - rp->br_startoff);
-                               if (diff < 0)
-                                       low = keyno + 1;
-                               else if (diff > 0)
-                                       high = keyno - 1;
-                               else
-                                       break;
-                       }
-               }
-               if (level > 0) {
-                       if (diff > 0 && --keyno < 1)
-                               keyno = 1;
-                       pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
-                       fsbno = be64_to_cpu(*pp);
-#ifdef DEBUG
-                       if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-#endif
-                       cur->bc_ptrs[level] = keyno;
-               }
-       }
-       if (dir != XFS_LOOKUP_LE && diff < 0) {
-               keyno++;
-               /*
-                * If ge search and we went off the end of the block, but it's
-                * not the last block, we're in the wrong block.
-                */
-               if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
-                   be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
-                       cur->bc_ptrs[0] = keyno;
-                       if ((error = xfs_bmbt_increment(cur, 0, &i))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-                       XFS_WANT_CORRUPTED_RETURN(i == 1);
-                       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-                       *stat = 1;
-                       return 0;
-               }
-       }
-       else if (dir == XFS_LOOKUP_LE && diff > 0)
-               keyno--;
-       cur->bc_ptrs[0] = keyno;
-       if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-       } else {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
-       }
-       return 0;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                                     /* error */
-xfs_bmbt_lshift(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       int                     error;          /* error return value */
-#ifdef DEBUG
-       int                     i;              /* loop counter */
-#endif
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp=NULL;      /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       int                     lrecs;          /* left record count */
-       xfs_bmbt_rec_t          *lrp=NULL;      /* left record pointer */
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp=NULL;      /* right btree key */
-       xfs_bmbt_ptr_t          *rpp=NULL;      /* right address pointer */
-       xfs_bmbt_rec_t          *rrp=NULL;      /* right record pointer */
-       int                     rrecs;          /* right record count */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       if (level == cur->bc_nlevels - 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       rbp = cur->bc_bufs[level];
-       right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       if (cur->bc_ptrs[level] <= 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       mp = cur->bc_mp;
-       if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
-                       &lbp, XFS_BMAP_BTREE_REF))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-       if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       lrecs = be16_to_cpu(left->bb_numrecs) + 1;
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               *lkp = *rkp;
-               xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
-               lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               *lpp = *rpp;
-               xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               *lrp = *rrp;
-               xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
-       }
-       left->bb_numrecs = cpu_to_be16(lrecs);
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
-       else
-               xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
-#endif
-       rrecs = be16_to_cpu(right->bb_numrecs) - 1;
-       right->bb_numrecs = cpu_to_be16(rrecs);
-       xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
-       if (level > 0) {
-#ifdef DEBUG
-               for (i = 0; i < rrecs; i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
-                                       level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
-               memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
-               xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
-               xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
-       } else {
-               memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
-               xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
-               key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
-               rkp = &key;
-       }
-       if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       cur->bc_ptrs[level]--;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                                     /* error */
-xfs_bmbt_rshift(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-       xfs_bmbt_key_t          key;            /* bmap btree key */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp;           /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       xfs_bmbt_rec_t          *lrp;           /* left record pointer */
-       xfs_mount_t             *mp;            /* file system mount point */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp;           /* right btree key */
-       xfs_bmbt_ptr_t          *rpp;           /* right address pointer */
-       xfs_bmbt_rec_t          *rrp=NULL;      /* right record pointer */
-       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       if (level == cur->bc_nlevels - 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       mp = cur->bc_mp;
-       if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
-                       &rbp, XFS_BMAP_BTREE_REF))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-       if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               *rkp = *lkp;
-               *rpp = *lpp;
-               xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               *rrp = *lrp;
-               xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
-               rkp = &key;
-       }
-       be16_add_cpu(&left->bb_numrecs, -1);
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, 1);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
-       else
-               xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
-#endif
-       xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
-       if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       i = xfs_btree_lastrec(tcur, level);
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       if ((error = xfs_bmbt_increment(tcur, level, &i))) {
-               XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
-               goto error1;
-       }
-       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-       if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
-               XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
-               goto error1;
-       }
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-error0:
-       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-error1:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
-}
-
 /*
  * Determine the extent state.
  */
@@ -1453,229 +60,15 @@ xfs_extent_state(
        return XFS_EXT_NORM;
 }
 
-
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int                                     /* error */
-xfs_bmbt_split(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       xfs_fsblock_t           *bnop,
-       __uint64_t              *startoff,
-       xfs_btree_cur_t         **curp,
-       int                     *stat)          /* success/failure */
-{
-       xfs_alloc_arg_t         args;           /* block allocation args */
-       int                     error;          /* error return value */
-       int                     i;              /* loop counter */
-       xfs_fsblock_t           lbno;           /* left sibling block number */
-       xfs_buf_t               *lbp;           /* left buffer pointer */
-       xfs_bmbt_block_t        *left;          /* left btree block */
-       xfs_bmbt_key_t          *lkp;           /* left btree key */
-       xfs_bmbt_ptr_t          *lpp;           /* left address pointer */
-       xfs_bmbt_rec_t          *lrp;           /* left record pointer */
-       xfs_buf_t               *rbp;           /* right buffer pointer */
-       xfs_bmbt_block_t        *right;         /* right btree block */
-       xfs_bmbt_key_t          *rkp;           /* right btree key */
-       xfs_bmbt_ptr_t          *rpp;           /* right address pointer */
-       xfs_bmbt_block_t        *rrblock;       /* right-right btree block */
-       xfs_buf_t               *rrbp;          /* right-right buffer pointer */
-       xfs_bmbt_rec_t          *rrp;           /* right record pointer */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       lbp = cur->bc_bufs[level];
-       lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
-       left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-       args.fsbno = cur->bc_private.b.firstblock;
-       args.firstblock = args.fsbno;
-       args.minleft = 0;
-       if (args.fsbno == NULLFSBLOCK) {
-               args.fsbno = lbno;
-               args.type = XFS_ALLOCTYPE_START_BNO;
-               /*
-                * Make sure there is sufficient room left in the AG to
-                * complete a full tree split for an extent insert.  If
-                * we are converting the middle part of an extent then
-                * we may need space for two tree splits.
-                *
-                * We are relying on the caller to make the correct block
-                * reservation for this operation to succeed.  If the
-                * reservation amount is insufficient then we may fail a
-                * block allocation here and corrupt the filesystem.
-                */
-               args.minleft = xfs_trans_get_block_res(args.tp);
-       } else if (cur->bc_private.b.flist->xbf_low)
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       else
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       args.mod = args.alignment = args.total = args.isfl =
-               args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
-       if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return XFS_ERROR(ENOSPC);
-       }
-       if ((error = xfs_alloc_vextent(&args))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (args.fsbno == NULLFSBLOCK && args.minleft) {
-               /*
-                * Could not find an AG with enough free space to satisfy
-                * a full btree split.  Try again without minleft and if
-                * successful activate the lowspace algorithm.
-                */
-               args.fsbno = 0;
-               args.type = XFS_ALLOCTYPE_FIRST_AG;
-               args.minleft = 0;
-               if ((error = xfs_alloc_vextent(&args))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               cur->bc_private.b.flist->xbf_low = 1;
-       }
-       if (args.fsbno == NULLFSBLOCK) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       cur->bc_private.b.firstblock = args.fsbno;
-       cur->bc_private.b.allocated++;
-       cur->bc_private.b.ip->i_d.di_nblocks++;
-       xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
-       XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
-                       XFS_TRANS_DQ_BCOUNT, 1L);
-       rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
-       right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
-       right->bb_level = left->bb_level;
-       right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
-       if ((be16_to_cpu(left->bb_numrecs) & 1) &&
-           cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-               be16_add_cpu(&right->bb_numrecs, 1);
-       i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
-       if (level > 0) {
-               lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
-               lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
-               rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
-               rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
-                               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                               return error;
-                       }
-               }
-#endif
-               memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *startoff = be64_to_cpu(rkp->br_startoff);
-       } else {
-               lrp = XFS_BMAP_REC_IADDR(left, i, cur);
-               rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
-               memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *startoff = xfs_bmbt_disk_get_startoff(rrp);
-       }
-       be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
-       right->bb_rightsib = left->bb_rightsib;
-       left->bb_rightsib = cpu_to_be64(args.fsbno);
-       right->bb_leftsib = cpu_to_be64(lbno);
-       xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
-       xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
-               if ((error = xfs_btree_read_bufl(args.mp, args.tp,
-                               be64_to_cpu(right->bb_rightsib), 0, &rrbp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
-               xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
-       }
-       if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
-               xfs_btree_setbuf(cur, level, rbp);
-               cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
-       }
-       if (level + 1 < cur->bc_nlevels) {
-               if ((error = xfs_btree_dup_cursor(cur, curp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               (*curp)->bc_ptrs[level + 1]++;
-       }
-       *bnop = args.fsbno;
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-
-/*
- * Update keys for the record.
- */
-STATIC int
-xfs_bmbt_updkey(
-       xfs_btree_cur_t         *cur,
-       xfs_bmbt_key_t          *keyp,  /* on-disk format */
-       int                     level)
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-#ifdef DEBUG
-       int                     error;
-#endif
-       xfs_bmbt_key_t          *kp;
-       int                     ptr;
-
-       ASSERT(level >= 1);
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-               block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               ptr = cur->bc_ptrs[level];
-               kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
-               *kp = *keyp;
-               xfs_bmbt_log_keys(cur, bp, ptr, ptr);
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       return 0;
-}
-
 /*
  * Convert on-disk form of btree root to in-memory form.
  */
 void
 xfs_bmdr_to_bmbt(
+       struct xfs_mount        *mp,
        xfs_bmdr_block_t        *dblock,
        int                     dblocklen,
-       xfs_bmbt_block_t        *rblock,
+       struct xfs_btree_block  *rblock,
        int                     rblocklen)
 {
        int                     dmxr;
@@ -1688,128 +81,18 @@ xfs_bmdr_to_bmbt(
        rblock->bb_level = dblock->bb_level;
        ASSERT(be16_to_cpu(rblock->bb_level) > 0);
        rblock->bb_numrecs = dblock->bb_numrecs;
-       rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
-       rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
-       dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
-       fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
-       tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
-       fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
-       tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+       rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+       rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+       dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+       fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+       tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+       fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+       tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
        dmxr = be16_to_cpu(dblock->bb_numrecs);
        memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
        memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
 }
 
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                            /* error */
-xfs_bmbt_decrement(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-       int                     error;          /* error return value */
-       xfs_fsblock_t           fsbno;
-       int                     lev;
-       xfs_mount_t             *mp;
-       xfs_trans_t             *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       ASSERT(level < cur->bc_nlevels);
-       if (level < cur->bc_nlevels - 1)
-               xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-       if (--cur->bc_ptrs[level] > 0) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               if (--cur->bc_ptrs[lev] > 0)
-                       break;
-               if (lev < cur->bc_nlevels - 1)
-                       xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-       }
-       if (lev == cur->bc_nlevels) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       tp = cur->bc_tp;
-       mp = cur->bc_mp;
-       for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
-               fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Delete the record pointed to by cur.
- */
-int                                    /* error */
-xfs_bmbt_delete(
-       xfs_btree_cur_t *cur,
-       int             *stat)          /* success/failure */
-{
-       int             error;          /* error return value */
-       int             i;
-       int             level;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       for (level = 0, i = 2; i == 2; level++) {
-               if ((error = xfs_bmbt_delrec(cur, level, &i))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-       }
-       if (i == 0) {
-               for (level = 1; level < cur->bc_nlevels; level++) {
-                       if (cur->bc_ptrs[level] == 0) {
-                               if ((error = xfs_bmbt_decrement(cur, level,
-                                               &i))) {
-                                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                                       return error;
-                               }
-                               break;
-                       }
-               }
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = i;
-       return 0;
-}
-
 /*
  * Convert a compressed bmap extent record to an uncompressed form.
  * This code must be in sync with the routines xfs_bmbt_get_startoff,
@@ -1863,31 +146,6 @@ xfs_bmbt_get_all(
        __xfs_bmbt_get_all(r->l0, r->l1, s);
 }
 
-/*
- * Get the block pointer for the given level of the cursor.
- * Fill in the buffer pointer, if applicable.
- */
-xfs_bmbt_block_t *
-xfs_bmbt_get_block(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       xfs_buf_t               **bpp)
-{
-       xfs_ifork_t             *ifp;
-       xfs_bmbt_block_t        *rval;
-
-       if (level < cur->bc_nlevels - 1) {
-               *bpp = cur->bc_bufs[level];
-               rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
-       } else {
-               *bpp = NULL;
-               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
-                       cur->bc_private.b.whichfork);
-               rval = ifp->if_broot;
-       }
-       return rval;
-}
-
 /*
  * Extract the blockcount field from an in memory bmap extent record.
  */
@@ -1950,373 +208,31 @@ xfs_bmbt_disk_get_all(
        xfs_bmbt_rec_t  *r,
        xfs_bmbt_irec_t *s)
 {
-       __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s);
-}
-
-/*
- * Extract the blockcount field from an on disk bmap extent record.
- */
-xfs_filblks_t
-xfs_bmbt_disk_get_blockcount(
-       xfs_bmbt_rec_t  *r)
-{
-       return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21));
-}
-
-/*
- * Extract the startoff field from a disk format bmap extent record.
- */
-xfs_fileoff_t
-xfs_bmbt_disk_get_startoff(
-       xfs_bmbt_rec_t  *r)
-{
-       return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
-                XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
-}
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                            /* error */
-xfs_bmbt_increment(
-       xfs_btree_cur_t         *cur,
-       int                     level,
-       int                     *stat)          /* success/failure */
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-       int                     error;          /* error return value */
-       xfs_fsblock_t           fsbno;
-       int                     lev;
-       xfs_mount_t             *mp;
-       xfs_trans_t             *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGI(cur, level);
-       ASSERT(level < cur->bc_nlevels);
-       if (level < cur->bc_nlevels - 1)
-               xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-       block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 1;
-               return 0;
-       }
-       if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               block = xfs_bmbt_get_block(cur, lev, &bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
-                       break;
-               if (lev < cur->bc_nlevels - 1)
-                       xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
-       }
-       if (lev == cur->bc_nlevels) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       tp = cur->bc_tp;
-       mp = cur->bc_mp;
-       for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
-               fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
-                               XFS_BMAP_BTREE_REF))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_BMBT_BLOCK(bp);
-               if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               cur->bc_ptrs[lev] = 1;
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = 1;
-       return 0;
-}
-
-/*
- * Insert the current record at the point referenced by cur.
- *
- * A multi-level split of the tree on insert will invalidate the original
- * cursor.  All callers of this function should assume that the cursor is
- * no longer valid and revalidate it.
- */
-int                                    /* error */
-xfs_bmbt_insert(
-       xfs_btree_cur_t *cur,
-       int             *stat)          /* success/failure */
-{
-       int             error;          /* error return value */
-       int             i;
-       int             level;
-       xfs_fsblock_t   nbno;
-       xfs_btree_cur_t *ncur;
-       xfs_bmbt_rec_t  nrec;
-       xfs_btree_cur_t *pcur;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       level = 0;
-       nbno = NULLFSBLOCK;
-       xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
-       ncur = NULL;
-       pcur = cur;
-       do {
-               if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
-                               &i))) {
-                       if (pcur != cur)
-                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
-                       cur->bc_nlevels = pcur->bc_nlevels;
-                       cur->bc_private.b.allocated +=
-                               pcur->bc_private.b.allocated;
-                       pcur->bc_private.b.allocated = 0;
-                       ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
-                              XFS_IS_REALTIME_INODE(cur->bc_private.b.ip));
-                       cur->bc_private.b.firstblock =
-                               pcur->bc_private.b.firstblock;
-                       ASSERT(cur->bc_private.b.flist ==
-                              pcur->bc_private.b.flist);
-                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-               }
-               if (ncur) {
-                       pcur = ncur;
-                       ncur = NULL;
-               }
-       } while (nbno != NULLFSBLOCK);
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *stat = i;
-       return 0;
-error0:
-       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-       return error;
-}
-
-/*
- * Log fields from the btree block header.
- */
-void
-xfs_bmbt_log_block(
-       xfs_btree_cur_t         *cur,
-       xfs_buf_t               *bp,
-       int                     fields)
-{
-       int                     first;
-       int                     last;
-       xfs_trans_t             *tp;
-       static const short      offsets[] = {
-               offsetof(xfs_bmbt_block_t, bb_magic),
-               offsetof(xfs_bmbt_block_t, bb_level),
-               offsetof(xfs_bmbt_block_t, bb_numrecs),
-               offsetof(xfs_bmbt_block_t, bb_leftsib),
-               offsetof(xfs_bmbt_block_t, bb_rightsib),
-               sizeof(xfs_bmbt_block_t)
-       };
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
-       tp = cur->bc_tp;
-       if (bp) {
-               xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
-                                 &last);
-               xfs_trans_log_buf(tp, bp, first, last);
-       } else
-               xfs_trans_log_inode(tp, cur->bc_private.b.ip,
-                       XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Log record values from the btree block.
- */
-void
-xfs_bmbt_log_recs(
-       xfs_btree_cur_t         *cur,
-       xfs_buf_t               *bp,
-       int                     rfirst,
-       int                     rlast)
-{
-       xfs_bmbt_block_t        *block;
-       int                     first;
-       int                     last;
-       xfs_bmbt_rec_t          *rp;
-       xfs_trans_t             *tp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
-       ASSERT(bp);
-       tp = cur->bc_tp;
-       block = XFS_BUF_TO_BMBT_BLOCK(bp);
-       rp = XFS_BMAP_REC_DADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(tp, bp, first, last);
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-int                                    /* error */
-xfs_bmbt_lookup_eq(
-       xfs_btree_cur_t *cur,
-       xfs_fileoff_t   off,
-       xfs_fsblock_t   bno,
-       xfs_filblks_t   len,
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.b.br_startoff = off;
-       cur->bc_rec.b.br_startblock = bno;
-       cur->bc_rec.b.br_blockcount = len;
-       return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-int                                    /* error */
-xfs_bmbt_lookup_ge(
-       xfs_btree_cur_t *cur,
-       xfs_fileoff_t   off,
-       xfs_fsblock_t   bno,
-       xfs_filblks_t   len,
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.b.br_startoff = off;
-       cur->bc_rec.b.br_startblock = bno;
-       cur->bc_rec.b.br_blockcount = len;
-       return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Give the bmap btree a new root block.  Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-int                                            /* error */
-xfs_bmbt_newroot(
-       xfs_btree_cur_t         *cur,           /* btree cursor */
-       int                     *logflags,      /* logging flags for inode */
-       int                     *stat)          /* return status - 0 fail */
-{
-       xfs_alloc_arg_t         args;           /* allocation arguments */
-       xfs_bmbt_block_t        *block;         /* bmap btree block */
-       xfs_buf_t               *bp;            /* buffer for block */
-       xfs_bmbt_block_t        *cblock;        /* child btree block */
-       xfs_bmbt_key_t          *ckp;           /* child key pointer */
-       xfs_bmbt_ptr_t          *cpp;           /* child ptr pointer */
-       int                     error;          /* error return code */
-#ifdef DEBUG
-       int                     i;              /* loop counter */
-#endif
-       xfs_bmbt_key_t          *kp;            /* pointer to bmap btree key */
-       int                     level;          /* btree level */
-       xfs_bmbt_ptr_t          *pp;            /* pointer to bmap block addr */
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       level = cur->bc_nlevels - 1;
-       block = xfs_bmbt_get_block(cur, level, &bp);
-       /*
-        * Copy the root into a real block.
-        */
-       args.mp = cur->bc_mp;
-       pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-       args.tp = cur->bc_tp;
-       args.fsbno = cur->bc_private.b.firstblock;
-       args.mod = args.minleft = args.alignment = args.total = args.isfl =
-               args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
-       args.firstblock = args.fsbno;
-       if (args.fsbno == NULLFSBLOCK) {
-#ifdef DEBUG
-               if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-#endif
-               args.fsbno = be64_to_cpu(*pp);
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       } else if (cur->bc_private.b.flist->xbf_low)
-               args.type = XFS_ALLOCTYPE_START_BNO;
-       else
-               args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       if ((error = xfs_alloc_vextent(&args))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       if (args.fsbno == NULLFSBLOCK) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       cur->bc_private.b.firstblock = args.fsbno;
-       cur->bc_private.b.allocated++;
-       cur->bc_private.b.ip->i_d.di_nblocks++;
-       XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
-                         XFS_TRANS_DQ_BCOUNT, 1L);
-       bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
-       cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
-       *cblock = *block;
-       be16_add_cpu(&block->bb_level, 1);
-       block->bb_numrecs = cpu_to_be16(1);
-       cur->bc_nlevels++;
-       cur->bc_ptrs[level + 1] = 1;
-       kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
-       ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
-       memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
-       cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
-       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
-               if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
-                       XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-                       return error;
-               }
-       }
-#endif
-       memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       *pp = cpu_to_be64(args.fsbno);
-       xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
-               cur->bc_private.b.whichfork);
-       xfs_btree_setbuf(cur, level, bp);
-       /*
-        * Do all this logging at the end so that
-        * the root is at the right level.
-        */
-       xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
-       xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
-       xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       *logflags |=
-               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
-       *stat = 1;
-       return 0;
+       __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s);
+}
+
+/*
+ * Extract the blockcount field from an on disk bmap extent record.
+ */
+xfs_filblks_t
+xfs_bmbt_disk_get_blockcount(
+       xfs_bmbt_rec_t  *r)
+{
+       return (xfs_filblks_t)(be64_to_cpu(r->l1) & XFS_MASK64LO(21));
+}
+
+/*
+ * Extract the startoff field from a disk format bmap extent record.
+ */
+xfs_fileoff_t
+xfs_bmbt_disk_get_startoff(
+       xfs_bmbt_rec_t  *r)
+{
+       return ((xfs_fileoff_t)be64_to_cpu(r->l0) &
+                XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
 }
 
+
 /*
  * Set all the fields in a bmap extent record from the arguments.
  */
@@ -2512,7 +428,8 @@ xfs_bmbt_set_state(
  */
 void
 xfs_bmbt_to_bmdr(
-       xfs_bmbt_block_t        *rblock,
+       struct xfs_mount        *mp,
+       struct xfs_btree_block  *rblock,
        int                     rblocklen,
        xfs_bmdr_block_t        *dblock,
        int                     dblocklen)
@@ -2524,66 +441,21 @@ xfs_bmbt_to_bmdr(
        __be64                  *tpp;
 
        ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
-       ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
-       ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO);
+       ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
+       ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
        ASSERT(be16_to_cpu(rblock->bb_level) > 0);
        dblock->bb_level = rblock->bb_level;
        dblock->bb_numrecs = rblock->bb_numrecs;
-       dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
-       fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
-       tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
-       fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
-       tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
+       dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+       fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+       tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+       fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+       tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
        dmxr = be16_to_cpu(dblock->bb_numrecs);
        memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
        memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
 }
 
-/*
- * Update the record to the passed values.
- */
-int
-xfs_bmbt_update(
-       xfs_btree_cur_t         *cur,
-       xfs_fileoff_t           off,
-       xfs_fsblock_t           bno,
-       xfs_filblks_t           len,
-       xfs_exntst_t            state)
-{
-       xfs_bmbt_block_t        *block;
-       xfs_buf_t               *bp;
-       int                     error;
-       xfs_bmbt_key_t          key;
-       int                     ptr;
-       xfs_bmbt_rec_t          *rp;
-
-       XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
-       XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
-               (xfs_dfilblks_t)len, (int)state);
-       block = xfs_bmbt_get_block(cur, 0, &bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-#endif
-       ptr = cur->bc_ptrs[0];
-       rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
-       xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
-       xfs_bmbt_log_recs(cur, bp, ptr, ptr);
-       if (ptr > 1) {
-               XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-               return 0;
-       }
-       key.br_startoff = cpu_to_be64(off);
-       if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
-               XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-               return error;
-       }
-       XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-       return 0;
-}
-
 /*
  * Check extent records, which have just been read, for
  * any bit in the extent flag field. ASSERT on debug
@@ -2608,3 +480,451 @@ xfs_check_nostate_extents(
        }
        return 0;
 }
+
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_btree_cur    *new;
+
+       new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+       /*
+        * Copy the firstblock, flist, and flags values,
+        * since init cursor doesn't get them.
+        */
+       new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+       new->bc_private.b.flist = cur->bc_private.b.flist;
+       new->bc_private.b.flags = cur->bc_private.b.flags;
+
+       return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+       struct xfs_btree_cur    *src,
+       struct xfs_btree_cur    *dst)
+{
+       ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+              (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+       ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+       dst->bc_private.b.allocated += src->bc_private.b.allocated;
+       dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+       src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     length,
+       int                     *stat)
+{
+       xfs_alloc_arg_t         args;           /* block allocation args */
+       int                     error;          /* error return value */
+
+       memset(&args, 0, sizeof(args));
+       args.tp = cur->bc_tp;
+       args.mp = cur->bc_mp;
+       args.fsbno = cur->bc_private.b.firstblock;
+       args.firstblock = args.fsbno;
+
+       if (args.fsbno == NULLFSBLOCK) {
+               args.fsbno = be64_to_cpu(start->l);
+               args.type = XFS_ALLOCTYPE_START_BNO;
+               /*
+                * Make sure there is sufficient room left in the AG to
+                * complete a full tree split for an extent insert.  If
+                * we are converting the middle part of an extent then
+                * we may need space for two tree splits.
+                *
+                * We are relying on the caller to make the correct block
+                * reservation for this operation to succeed.  If the
+                * reservation amount is insufficient then we may fail a
+                * block allocation here and corrupt the filesystem.
+                */
+               args.minleft = xfs_trans_get_block_res(args.tp);
+       } else if (cur->bc_private.b.flist->xbf_low) {
+               args.type = XFS_ALLOCTYPE_START_BNO;
+       } else {
+               args.type = XFS_ALLOCTYPE_NEAR_BNO;
+       }
+
+       args.minlen = args.maxlen = args.prod = 1;
+       args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+       if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+               error = XFS_ERROR(ENOSPC);
+               goto error0;
+       }
+       error = xfs_alloc_vextent(&args);
+       if (error)
+               goto error0;
+
+       if (args.fsbno == NULLFSBLOCK && args.minleft) {
+               /*
+                * Could not find an AG with enough free space to satisfy
+                * a full btree split.  Try again without minleft and if
+                * successful activate the lowspace algorithm.
+                */
+               args.fsbno = 0;
+               args.type = XFS_ALLOCTYPE_FIRST_AG;
+               args.minleft = 0;
+               error = xfs_alloc_vextent(&args);
+               if (error)
+                       goto error0;
+               cur->bc_private.b.flist->xbf_low = 1;
+       }
+       if (args.fsbno == NULLFSBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+       ASSERT(args.len == 1);
+       cur->bc_private.b.firstblock = args.fsbno;
+       cur->bc_private.b.allocated++;
+       cur->bc_private.b.ip->i_d.di_nblocks++;
+       xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+       XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
+                       XFS_TRANS_DQ_BCOUNT, 1L);
+
+       new->l = cpu_to_be64(args.fsbno);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+ error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       struct xfs_trans        *tp = cur->bc_tp;
+       xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+       xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+       ip->i_d.di_nblocks--;
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+       xfs_trans_binval(tp, bp);
+       return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level == cur->bc_nlevels - 1) {
+               struct xfs_ifork        *ifp;
+
+               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+                                   cur->bc_private.b.whichfork);
+
+               return xfs_bmbt_maxrecs(cur->bc_mp,
+                                       ifp->if_broot_bytes, level == 0) / 2;
+       }
+
+       return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level == cur->bc_nlevels - 1) {
+               struct xfs_ifork        *ifp;
+
+               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+                                   cur->bc_private.b.whichfork);
+
+               return xfs_bmbt_maxrecs(cur->bc_mp,
+                                       ifp->if_broot_bytes, level == 0);
+       }
+
+       return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it.  After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
+{
+       if (level != cur->bc_nlevels - 1)
+               return cur->bc_mp->m_bmap_dmxr[level != 0];
+       return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
+                               level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       key->bmbt.br_startoff =
+               cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
+{
+       ASSERT(key->bmbt.br_startoff != 0);
+
+       xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+                              0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
+{
+       return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+                                     cur->bc_rec.b.br_startoff;
+}
+
+#ifdef DEBUG
+STATIC int
+xfs_bmbt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       return be64_to_cpu(k1->bmbt.br_startoff) <
+               be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
+{
+       return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+               xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+               xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif /* DEBUG */
+
+#ifdef XFS_BTREE_TRACE
+ktrace_t       *xfs_bmbt_trace_buf;
+
+STATIC void
+xfs_bmbt_trace_enter(
+       struct xfs_btree_cur    *cur,
+       const char              *func,
+       char                    *s,
+       int                     type,
+       int                     line,
+       __psunsigned_t          a0,
+       __psunsigned_t          a1,
+       __psunsigned_t          a2,
+       __psunsigned_t          a3,
+       __psunsigned_t          a4,
+       __psunsigned_t          a5,
+       __psunsigned_t          a6,
+       __psunsigned_t          a7,
+       __psunsigned_t          a8,
+       __psunsigned_t          a9,
+       __psunsigned_t          a10)
+{
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       int                     whichfork = cur->bc_private.b.whichfork;
+
+       ktrace_enter(xfs_bmbt_trace_buf,
+               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+               (void *)func, (void *)s, (void *)ip, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
+       ktrace_enter(ip->i_btrace,
+               (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+               (void *)func, (void *)s, (void *)ip, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
+}
+
+STATIC void
+xfs_bmbt_trace_cursor(
+       struct xfs_btree_cur    *cur,
+       __uint32_t              *s0,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
+{
+       struct xfs_bmbt_rec_host r;
+
+       xfs_bmbt_set_all(&r, &cur->bc_rec.b);
+
+       *s0 = (cur->bc_nlevels << 24) |
+             (cur->bc_private.b.flags << 16) |
+              cur->bc_private.b.allocated;
+       *l0 = r.l0;
+       *l1 = r.l1;
+}
+
+STATIC void
+xfs_bmbt_trace_key(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
+{
+       *l0 = be64_to_cpu(key->bmbt.br_startoff);
+       *l1 = 0;
+}
+
+STATIC void
+xfs_bmbt_trace_record(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       __uint64_t              *l0,
+       __uint64_t              *l1,
+       __uint64_t              *l2)
+{
+       struct xfs_bmbt_irec    irec;
+
+       xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+       *l0 = irec.br_startoff;
+       *l1 = irec.br_startblock;
+       *l2 = irec.br_blockcount;
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+       .rec_len                = sizeof(xfs_bmbt_rec_t),
+       .key_len                = sizeof(xfs_bmbt_key_t),
+
+       .dup_cursor             = xfs_bmbt_dup_cursor,
+       .update_cursor          = xfs_bmbt_update_cursor,
+       .alloc_block            = xfs_bmbt_alloc_block,
+       .free_block             = xfs_bmbt_free_block,
+       .get_maxrecs            = xfs_bmbt_get_maxrecs,
+       .get_minrecs            = xfs_bmbt_get_minrecs,
+       .get_dmaxrecs           = xfs_bmbt_get_dmaxrecs,
+       .init_key_from_rec      = xfs_bmbt_init_key_from_rec,
+       .init_rec_from_key      = xfs_bmbt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_bmbt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_bmbt_init_ptr_from_cur,
+       .key_diff               = xfs_bmbt_key_diff,
+
+#ifdef DEBUG
+       .keys_inorder           = xfs_bmbt_keys_inorder,
+       .recs_inorder           = xfs_bmbt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+       .trace_enter            = xfs_bmbt_trace_enter,
+       .trace_cursor           = xfs_bmbt_trace_cursor,
+       .trace_key              = xfs_bmbt_trace_key,
+       .trace_record           = xfs_bmbt_trace_record,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur *                         /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_inode        *ip,            /* inode owning the btree */
+       int                     whichfork)      /* data or attr fork */
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_btree_cur    *cur;
+
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+       cur->bc_btnum = XFS_BTNUM_BMAP;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+       cur->bc_ops = &xfs_bmbt_ops;
+       cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+
+       cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+       cur->bc_private.b.ip = ip;
+       cur->bc_private.b.firstblock = NULLFSBLOCK;
+       cur->bc_private.b.flist = NULL;
+       cur->bc_private.b.allocated = 0;
+       cur->bc_private.b.flags = 0;
+       cur->bc_private.b.whichfork = whichfork;
+
+       return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_bmbt_rec_t);
+       return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
+{
+       blocklen -= sizeof(xfs_bmdr_block_t);
+
+       if (leaf)
+               return blocklen / sizeof(xfs_bmdr_rec_t);
+       return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
index cd0d4b4bb81645cb50331cc70a3d254eb80e849f..a4555abb6622a5d2ac6c08ab0f585761d6ff4585 100644 (file)
 #define XFS_BMAP_MAGIC 0x424d4150      /* 'BMAP' */
 
 struct xfs_btree_cur;
-struct xfs_btree_lblock;
+struct xfs_btree_block;
 struct xfs_mount;
 struct xfs_inode;
+struct xfs_trans;
 
 /*
  * Bmap root header, on-disk form only.
@@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key {
 /* btree pointer type */
 typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
 
-/* btree block header type */
-typedef struct xfs_btree_lblock xfs_bmbt_block_t;
-
-#define XFS_BUF_TO_BMBT_BLOCK(bp)      ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
-
-#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
-#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
-       ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
-                   (cur)->bc_private.b.whichfork)->if_broot_bytes)
-
-#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-               XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
-                       xfs_bmdr, (lev) == 0) : \
-               ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-                       XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-                               xfs_bmbt, (lev) == 0) : \
-                       ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-
-#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-                       XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\
-                               xfs_bmdr, (lev) == 0) : \
-                       ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
-       (((lev) == (cur)->bc_nlevels - 1 ? \
-                       XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
-                               xfs_bmbt, (lev) == 0) : \
-                       ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-
-#define XFS_BMAP_REC_DADDR(bb,i,cur)   (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_REC_IADDR(bb,i,cur)   (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_KEY_DADDR(bb,i,cur)   \
-       (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_KEY_IADDR(bb,i,cur)   \
-       (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_PTR_DADDR(bb,i,cur)   \
-       (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS(   \
-                               be16_to_cpu((bb)->bb_level), cur)))
-#define XFS_BMAP_PTR_IADDR(bb,i,cur)   \
-       (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS(   \
-                               be16_to_cpu((bb)->bb_level), cur)))
+/*
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+       ((xfs_bmbt_rec_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+       ((xfs_bmbt_key_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_bmbt_ptr_t *) \
+               ((char *)(block) + \
+                XFS_BMBT_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+                ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+       ((xfs_bmdr_rec_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+       ((xfs_bmdr_key_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+       ((xfs_bmdr_ptr_t *) \
+               ((char *)(block) + \
+                sizeof(struct xfs_bmdr_block) + \
+                (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+                ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
 
 /*
  * These are to be used when we know the size of the block and
  * we don't have a cursor.
  */
-#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
-       (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i))
-#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
-       (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i))
-#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
-       (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-
-#define XFS_BMAP_BROOT_NUMRECS(bb)     be16_to_cpu((bb)->bb_numrecs)
-#define XFS_BMAP_BROOT_MAXRECS(sz)     XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+       XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
 
 #define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
-       (int)(sizeof(xfs_bmbt_block_t) + \
+       (int)(XFS_BTREE_LBLOCK_LEN + \
               ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
 
 #define XFS_BMAP_BROOT_SPACE(bb) \
@@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t;
  */
 #define XFS_BM_MAXLEVELS(mp,w)         ((mp)->m_bm_maxlevels[(w)])
 
-#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
-       (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
-        be16_to_cpu((bb)->bb_level) == level && \
-        be16_to_cpu((bb)->bb_numrecs) > 0 && \
-        be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
-
-
-#ifdef __KERNEL__
-
-#if defined(XFS_BMBT_TRACE)
-/*
- * Trace buffer entry types.
- */
-#define XFS_BMBT_KTRACE_ARGBI  1
-#define XFS_BMBT_KTRACE_ARGBII 2
-#define XFS_BMBT_KTRACE_ARGFFFI 3
-#define XFS_BMBT_KTRACE_ARGI   4
-#define XFS_BMBT_KTRACE_ARGIFK 5
-#define XFS_BMBT_KTRACE_ARGIFR 6
-#define XFS_BMBT_KTRACE_ARGIK  7
-#define XFS_BMBT_KTRACE_CUR    8
-
-#define XFS_BMBT_TRACE_SIZE    4096    /* size of global trace buffer */
-#define XFS_BMBT_KTRACE_SIZE   32      /* size of per-inode trace buffer */
-extern ktrace_t        *xfs_bmbt_trace_buf;
-#endif
-
 /*
  * Prototypes for xfs_bmap.c to call.
  */
-extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int);
-extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
+extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+                       struct xfs_btree_block *, int);
 extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
-                                               int, struct xfs_buf **bpp);
 extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
 extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
 extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
@@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
 extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
 
-extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
-extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
-extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
-                               int);
-extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
-                               xfs_fsblock_t, xfs_filblks_t, int *);
-extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
-                               xfs_fsblock_t, xfs_filblks_t, int *);
-
-/*
- * Give the bmap btree a new root block.  Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
-
 extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
 extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
                        xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
@@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
 extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
                        xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
 
-extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int);
-extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t,
-                               xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t);
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+                       xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_inode *, int);
 
-#endif /* __KERNEL__ */
 
 #endif /* __XFS_BMAP_BTREE_H__ */
index cc593a84c34572211cc2ff44997457838bbe4f5d..7ed59267420d8e5a661efac5b10190a85066d2f0 100644 (file)
@@ -34,7 +34,9 @@
 #include "xfs_attr_sf.h"
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
+#include "xfs_inode_item.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_error.h"
 
@@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
        XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
 };
 
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int                             /* number of records fitting in block */
-xfs_btree_maxrecs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_block_t       *block) /* generic btree block pointer */
-{
-       switch (cur->bc_btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               return (int)XFS_ALLOC_BLOCK_MAXRECS(
-                               be16_to_cpu(block->bb_h.bb_level), cur);
-       case XFS_BTNUM_BMAP:
-               return (int)XFS_BMAP_BLOCK_IMAXRECS(
-                               be16_to_cpu(block->bb_h.bb_level), cur);
-       case XFS_BTNUM_INO:
-               return (int)XFS_INOBT_BLOCK_MAXRECS(
-                               be16_to_cpu(block->bb_h.bb_level), cur);
-       default:
-               ASSERT(0);
-               return 0;
-       }
-}
-
-/*
- * External routines.
- */
-
-#ifdef DEBUG
-/*
- * Debug routine: check that block header is ok.
- */
-void
-xfs_btree_check_block(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_block_t       *block, /* generic btree block pointer */
-       int                     level,  /* level of the btree block */
-       xfs_buf_t               *bp)    /* buffer containing block, if any */
-{
-       if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
-               xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
-                       bp);
-       else
-               xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
-                       bp);
-}
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
-       xfs_btnum_t     btnum,          /* btree identifier */
-       void            *ak1,           /* pointer to left (lower) key */
-       void            *ak2)           /* pointer to right (higher) key */
-{
-       switch (btnum) {
-       case XFS_BTNUM_BNO: {
-               xfs_alloc_key_t *k1;
-               xfs_alloc_key_t *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
-               break;
-           }
-       case XFS_BTNUM_CNT: {
-               xfs_alloc_key_t *k1;
-               xfs_alloc_key_t *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
-                      (k1->ar_blockcount == k2->ar_blockcount &&
-                       be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
-               break;
-           }
-       case XFS_BTNUM_BMAP: {
-               xfs_bmbt_key_t  *k1;
-               xfs_bmbt_key_t  *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
-               break;
-           }
-       case XFS_BTNUM_INO: {
-               xfs_inobt_key_t *k1;
-               xfs_inobt_key_t *k2;
-
-               k1 = ak1;
-               k2 = ak2;
-               ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
-               break;
-           }
-       default:
-               ASSERT(0);
-       }
-}
-#endif /* DEBUG */
 
-/*
- * Checking routine: check that long form block header is ok.
- */
-/* ARGSUSED */
-int                                    /* error (0 or EFSCORRUPTED) */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_lblock_t      *block, /* btree long form block pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* btree long form block pointer */
        int                     level,  /* level of the btree block */
-       xfs_buf_t               *bp)    /* buffer for block, if any */
+       struct xfs_buf          *bp)    /* buffer for block, if any */
 {
        int                     lblock_ok; /* block passes checks */
-       xfs_mount_t             *mp;    /* file system mount point */
+       struct xfs_mount        *mp;    /* file system mount point */
 
        mp = cur->bc_mp;
        lblock_ok =
                be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
                be16_to_cpu(block->bb_level) == level &&
                be16_to_cpu(block->bb_numrecs) <=
-                       xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-               block->bb_leftsib &&
-               (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO ||
-                XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) &&
-               block->bb_rightsib &&
-               (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO ||
-                XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib)));
-       if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+                       cur->bc_ops->get_maxrecs(cur, level) &&
+               block->bb_u.l.bb_leftsib &&
+               (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+               block->bb_u.l.bb_rightsib &&
+               (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_rightsib)));
+       if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+                       XFS_ERRTAG_BTREE_CHECK_LBLOCK,
                        XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
                if (bp)
                        xfs_buftrace("LBTREE ERROR", bp);
@@ -189,98 +89,15 @@ xfs_btree_check_lblock(
        return 0;
 }
 
-/*
- * Checking routine: check that (long) pointer is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_dfsbno_t    ptr,            /* btree block disk address */
-       int             level)          /* btree block level */
-{
-       xfs_mount_t     *mp;            /* file system mount point */
-
-       mp = cur->bc_mp;
-       XFS_WANT_CORRUPTED_RETURN(
-               level > 0 &&
-               ptr != NULLDFSBNO &&
-               XFS_FSB_SANITY_CHECK(mp, ptr));
-       return 0;
-}
-
-#ifdef DEBUG
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
-       xfs_btnum_t     btnum,          /* btree identifier */
-       void            *ar1,           /* pointer to left (lower) record */
-       void            *ar2)           /* pointer to right (higher) record */
-{
-       switch (btnum) {
-       case XFS_BTNUM_BNO: {
-               xfs_alloc_rec_t *r1;
-               xfs_alloc_rec_t *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(be32_to_cpu(r1->ar_startblock) +
-                      be32_to_cpu(r1->ar_blockcount) <=
-                      be32_to_cpu(r2->ar_startblock));
-               break;
-           }
-       case XFS_BTNUM_CNT: {
-               xfs_alloc_rec_t *r1;
-               xfs_alloc_rec_t *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
-                      (r1->ar_blockcount == r2->ar_blockcount &&
-                       be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
-               break;
-           }
-       case XFS_BTNUM_BMAP: {
-               xfs_bmbt_rec_t  *r1;
-               xfs_bmbt_rec_t  *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(xfs_bmbt_disk_get_startoff(r1) +
-                      xfs_bmbt_disk_get_blockcount(r1) <=
-                      xfs_bmbt_disk_get_startoff(r2));
-               break;
-           }
-       case XFS_BTNUM_INO: {
-               xfs_inobt_rec_t *r1;
-               xfs_inobt_rec_t *r2;
-
-               r1 = ar1;
-               r2 = ar2;
-               ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
-                      be32_to_cpu(r2->ir_startino));
-               break;
-           }
-       default:
-               ASSERT(0);
-       }
-}
-#endif /* DEBUG */
-
-/*
- * Checking routine: check that block header is ok.
- */
-/* ARGSUSED */
-int                                    /* error (0 or EFSCORRUPTED) */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_sblock_t      *block, /* btree short form block pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* btree short form block pointer */
        int                     level,  /* level of the btree block */
-       xfs_buf_t               *bp)    /* buffer containing block */
+       struct xfs_buf          *bp)    /* buffer containing block */
 {
-       xfs_buf_t               *agbp;  /* buffer for ag. freespace struct */
-       xfs_agf_t               *agf;   /* ag. freespace structure */
+       struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
+       struct xfs_agf          *agf;   /* ag. freespace structure */
        xfs_agblock_t           agflen; /* native ag. freespace length */
        int                     sblock_ok; /* block passes checks */
 
@@ -291,13 +108,13 @@ xfs_btree_check_sblock(
                be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
                be16_to_cpu(block->bb_level) == level &&
                be16_to_cpu(block->bb_numrecs) <=
-                       xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
-               (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK ||
-                be32_to_cpu(block->bb_leftsib) < agflen) &&
-               block->bb_leftsib &&
-               (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK ||
-                be32_to_cpu(block->bb_rightsib) < agflen) &&
-               block->bb_rightsib;
+                       cur->bc_ops->get_maxrecs(cur, level) &&
+               (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
+                be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+               block->bb_u.s.bb_leftsib &&
+               (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
+                be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+               block->bb_u.s.bb_rightsib;
        if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
                        XFS_ERRTAG_BTREE_CHECK_SBLOCK,
                        XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
@@ -311,26 +128,77 @@ xfs_btree_check_sblock(
 }
 
 /*
- * Checking routine: check that (short) pointer is ok.
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* generic btree block pointer */
+       int                     level,  /* level of the btree block */
+       struct xfs_buf          *bp)    /* buffer containing block, if any */
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return xfs_btree_check_lblock(cur, block, level, bp);
+       else
+               return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
  */
 int                                    /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_dfsbno_t            bno,    /* btree block disk address */
+       int                     level)  /* btree block level */
+{
+       XFS_WANT_CORRUPTED_RETURN(
+               level > 0 &&
+               bno != NULLDFSBNO &&
+               XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+       return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_sptr(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agblock_t   ptr,            /* btree block disk address */
-       int             level)          /* btree block level */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agblock_t           bno,    /* btree block disk address */
+       int                     level)  /* btree block level */
 {
-       xfs_buf_t       *agbp;          /* buffer for ag. freespace struct */
-       xfs_agf_t       *agf;           /* ag. freespace structure */
+       xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
 
-       agbp = cur->bc_private.a.agbp;
-       agf = XFS_BUF_TO_AGF(agbp);
        XFS_WANT_CORRUPTED_RETURN(
                level > 0 &&
-               ptr != NULLAGBLOCK && ptr != 0 &&
-               ptr < be32_to_cpu(agf->agf_length));
+               bno != NULLAGBLOCK &&
+               bno != 0 &&
+               bno < agblocks);
        return 0;
 }
 
+/*
+ * Check that block ptr is ok.
+ */
+STATIC int                             /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       union xfs_btree_ptr     *ptr,   /* btree block disk address */
+       int                     index,  /* offset from ptr to check */
+       int                     level)  /* btree block level */
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               return xfs_btree_check_lptr(cur,
+                               be64_to_cpu((&ptr->l)[index]), level);
+       } else {
+               return xfs_btree_check_sptr(cur,
+                               be32_to_cpu((&ptr->s)[index]), level);
+       }
+}
+#endif
+
 /*
  * Delete the btree cursor.
  */
@@ -387,16 +255,17 @@ xfs_btree_dup_cursor(
 
        tp = cur->bc_tp;
        mp = cur->bc_mp;
+
        /*
         * Allocate a new cursor like the old one.
         */
-       new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
-               cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
-               cur->bc_private.b.whichfork);
+       new = cur->bc_ops->dup_cursor(cur);
+
        /*
         * Copy the record currently in the cursor.
         */
        new->bc_rec = cur->bc_rec;
+
        /*
         * For each level current, re-get the buffer and copy the ptr value.
         */
@@ -416,46 +285,174 @@ xfs_btree_dup_cursor(
                } else
                        new->bc_bufs[i] = NULL;
        }
-       /*
-        * For bmap btrees, copy the firstblock, flist, and flags values,
-        * since init cursor doesn't get them.
-        */
-       if (new->bc_btnum == XFS_BTNUM_BMAP) {
-               new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
-               new->bc_private.b.flist = cur->bc_private.b.flist;
-               new->bc_private.b.flags = cur->bc_private.b.flags;
-       }
        *ncur = new;
        return 0;
 }
 
+/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * The leaf record start with a header then followed by records containing
+ * the values.  A non-leaf block also starts with the same header, and
+ * then first contains lookup keys followed by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf:       | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ *
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf:   | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ *             +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers.  The record and key structures are defined by the btree instances
+ * and opaque to the btree core.  The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
+
+/*
+ * Return size of the btree block header for this btree instance.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+       return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+               XFS_BTREE_LBLOCK_LEN :
+               XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Return size of btree block pointers for this btree instance.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+       return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+               sizeof(__be64) : sizeof(__be32);
+}
+
+/*
+ * Calculate offset of the n-th record in a btree block.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n)
+{
+       return xfs_btree_block_len(cur) +
+               (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Calculate offset of the n-th key in a btree block.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n)
+{
+       return xfs_btree_block_len(cur) +
+               (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Calculate offset of the n-th block pointer in a btree block.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       int                     level)
+{
+       return xfs_btree_block_len(cur) +
+               cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
+               (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Return a pointer to the n-th record in the btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       return (union xfs_btree_rec *)
+               ((char *)block + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th key in the btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       return (union xfs_btree_key *)
+               ((char *)block + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Return a pointer to the n-th block pointer in the btree block.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+       struct xfs_btree_cur    *cur,
+       int                     n,
+       struct xfs_btree_block  *block)
+{
+       int                     level = xfs_btree_get_level(block);
+
+       ASSERT(block->bb_level != 0);
+
+       return (union xfs_btree_ptr *)
+               ((char *)block + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Get a the root block which is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       struct xfs_ifork        *ifp;
+
+       ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+       return (struct xfs_btree_block *)ifp->if_broot;
+}
+
 /*
  * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
+ * This may be an inode btree root or from a buffer.
  */
-STATIC xfs_btree_block_t *             /* generic btree block pointer */
+STATIC struct xfs_btree_block *                /* generic btree block pointer */
 xfs_btree_get_block(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
        int                     level,  /* level in btree */
-       xfs_buf_t               **bpp)  /* buffer containing the block */
-{
-       xfs_btree_block_t       *block; /* return value */
-       xfs_buf_t               *bp;    /* return buffer */
-       xfs_ifork_t             *ifp;   /* inode fork pointer */
-       int                     whichfork; /* data or attr fork */
-
-       if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
-               whichfork = cur->bc_private.b.whichfork;
-               ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
-               block = (xfs_btree_block_t *)ifp->if_broot;
-               bp = NULL;
-       } else {
-               bp = cur->bc_bufs[level];
-               block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_buf          **bpp)  /* buffer containing the block */
+{
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1)) {
+               *bpp = NULL;
+               return xfs_btree_get_iroot(cur);
        }
-       ASSERT(block != NULL);
-       *bpp = bp;
-       return block;
+
+       *bpp = cur->bc_bufs[level];
+       return XFS_BUF_TO_BLOCK(*bpp);
 }
 
 /*
@@ -504,97 +501,6 @@ xfs_btree_get_bufs(
        return bp;
 }
 
-/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B) or inodes (I).
- */
-xfs_btree_cur_t *                      /* new btree cursor */
-xfs_btree_init_cursor(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_buf_t       *agbp,          /* (A only) buffer for agf structure */
-                                       /* (I only) buffer for agi structure */
-       xfs_agnumber_t  agno,           /* (AI only) allocation group number */
-       xfs_btnum_t     btnum,          /* btree identifier */
-       xfs_inode_t     *ip,            /* (B only) inode owning the btree */
-       int             whichfork)      /* (B only) data or attr fork */
-{
-       xfs_agf_t       *agf;           /* (A) allocation group freespace */
-       xfs_agi_t       *agi;           /* (I) allocation group inodespace */
-       xfs_btree_cur_t *cur;           /* return value */
-       xfs_ifork_t     *ifp;           /* (I) inode fork pointer */
-       int             nlevels=0;      /* number of levels in the btree */
-
-       ASSERT(xfs_btree_cur_zone != NULL);
-       /*
-        * Allocate a new cursor.
-        */
-       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-       /*
-        * Deduce the number of btree levels from the arguments.
-        */
-       switch (btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               agf = XFS_BUF_TO_AGF(agbp);
-               nlevels = be32_to_cpu(agf->agf_levels[btnum]);
-               break;
-       case XFS_BTNUM_BMAP:
-               ifp = XFS_IFORK_PTR(ip, whichfork);
-               nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-               break;
-       case XFS_BTNUM_INO:
-               agi = XFS_BUF_TO_AGI(agbp);
-               nlevels = be32_to_cpu(agi->agi_level);
-               break;
-       default:
-               ASSERT(0);
-       }
-       /*
-        * Fill in the common fields.
-        */
-       cur->bc_tp = tp;
-       cur->bc_mp = mp;
-       cur->bc_nlevels = nlevels;
-       cur->bc_btnum = btnum;
-       cur->bc_blocklog = mp->m_sb.sb_blocklog;
-       /*
-        * Fill in private fields.
-        */
-       switch (btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               /*
-                * Allocation btree fields.
-                */
-               cur->bc_private.a.agbp = agbp;
-               cur->bc_private.a.agno = agno;
-               break;
-       case XFS_BTNUM_INO:
-               /*
-                * Inode allocation btree fields.
-                */
-               cur->bc_private.a.agbp = agbp;
-               cur->bc_private.a.agno = agno;
-               break;
-       case XFS_BTNUM_BMAP:
-               /*
-                * Bmap btree fields.
-                */
-               cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
-               cur->bc_private.b.ip = ip;
-               cur->bc_private.b.firstblock = NULLFSBLOCK;
-               cur->bc_private.b.flist = NULL;
-               cur->bc_private.b.allocated = 0;
-               cur->bc_private.b.flags = 0;
-               cur->bc_private.b.whichfork = whichfork;
-               break;
-       default:
-               ASSERT(0);
-       }
-       return cur;
-}
-
 /*
  * Check for the cursor referring to the last block at the given level.
  */
@@ -603,12 +509,12 @@ xfs_btree_islastblock(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level)  /* level to check */
 {
-       xfs_btree_block_t       *block; /* generic btree block pointer */
+       struct xfs_btree_block  *block; /* generic btree block pointer */
        xfs_buf_t               *bp;    /* buffer containing block */
 
        block = xfs_btree_get_block(cur, level, &bp);
        xfs_btree_check_block(cur, block, level, bp);
-       if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
                return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
        else
                return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
@@ -618,12 +524,12 @@ xfs_btree_islastblock(
  * Change the cursor to point to the first record at the given level.
  * Other levels are unaffected.
  */
-int                                    /* success=1, failure=0 */
+STATIC int                             /* success=1, failure=0 */
 xfs_btree_firstrec(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level)  /* level to change */
 {
-       xfs_btree_block_t       *block; /* generic btree block pointer */
+       struct xfs_btree_block  *block; /* generic btree block pointer */
        xfs_buf_t               *bp;    /* buffer containing block */
 
        /*
@@ -634,7 +540,7 @@ xfs_btree_firstrec(
        /*
         * It's empty, there is no such record.
         */
-       if (!block->bb_h.bb_numrecs)
+       if (!block->bb_numrecs)
                return 0;
        /*
         * Set the ptr value to 1, that's the first record/key.
@@ -647,12 +553,12 @@ xfs_btree_firstrec(
  * Change the cursor to point to the last record in the current block
  * at the given level.  Other levels are unaffected.
  */
-int                                    /* success=1, failure=0 */
+STATIC int                             /* success=1, failure=0 */
 xfs_btree_lastrec(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level)  /* level to change */
 {
-       xfs_btree_block_t       *block; /* generic btree block pointer */
+       struct xfs_btree_block  *block; /* generic btree block pointer */
        xfs_buf_t               *bp;    /* buffer containing block */
 
        /*
@@ -663,12 +569,12 @@ xfs_btree_lastrec(
        /*
         * It's empty, there is no such record.
         */
-       if (!block->bb_h.bb_numrecs)
+       if (!block->bb_numrecs)
                return 0;
        /*
         * Set the ptr value to numrecs, that's the last record/key.
         */
-       cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs);
+       cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
        return 1;
 }
 
@@ -817,80 +723,98 @@ xfs_btree_reada_bufs(
        xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
 }
 
-/*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
- */
-int
-xfs_btree_readahead_core(
-       xfs_btree_cur_t         *cur,           /* btree cursor */
-       int                     lev,            /* level in btree */
-       int                     lr)             /* left/right bits */
+STATIC int
+xfs_btree_readahead_lblock(
+       struct xfs_btree_cur    *cur,
+       int                     lr,
+       struct xfs_btree_block  *block)
 {
-       xfs_alloc_block_t       *a;
-       xfs_bmbt_block_t        *b;
-       xfs_inobt_block_t       *i;
        int                     rval = 0;
+       xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+       xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
-       ASSERT(cur->bc_bufs[lev] != NULL);
-       cur->bc_ra[lev] |= lr;
-       switch (cur->bc_btnum) {
-       case XFS_BTNUM_BNO:
-       case XFS_BTNUM_CNT:
-               a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
-               if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(a->bb_leftsib), 1);
-                       rval++;
-               }
-               if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(a->bb_rightsib), 1);
-                       rval++;
-               }
-               break;
-       case XFS_BTNUM_BMAP:
-               b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
-               if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
-                       xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
-                       rval++;
-               }
-               if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
-                       xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
-                       rval++;
-               }
-               break;
-       case XFS_BTNUM_INO:
-               i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
-               if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(i->bb_leftsib), 1);
-                       rval++;
-               }
-               if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
-                       xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                               be32_to_cpu(i->bb_rightsib), 1);
-                       rval++;
-               }
-               break;
-       default:
-               ASSERT(0);
+       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+               xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+               rval++;
+       }
+
+       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+               xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+               rval++;
        }
+
        return rval;
 }
 
-/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
- */
-void
-xfs_btree_setbuf(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     lev,    /* level in btree */
-       xfs_buf_t               *bp)    /* new buffer to set */
+STATIC int
+xfs_btree_readahead_sblock(
+       struct xfs_btree_cur    *cur,
+       int                     lr,
+       struct xfs_btree_block *block)
 {
-       xfs_btree_block_t       *b;     /* btree block */
-       xfs_buf_t               *obp;   /* old buffer pointer */
+       int                     rval = 0;
+       xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
+       xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
+
+
+       if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
+               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+                                    left, 1);
+               rval++;
+       }
+
+       if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
+               xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+                                    right, 1);
+               rval++;
+       }
+
+       return rval;
+}
+
+/*
+ * Read-ahead btree blocks, at the given level.
+ * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ */
+STATIC int
+xfs_btree_readahead(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     lev,            /* level in btree */
+       int                     lr)             /* left/right bits */
+{
+       struct xfs_btree_block  *block;
+
+       /*
+        * No readahead needed if we are at the root level and the
+        * btree root is stored in the inode.
+        */
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (lev == cur->bc_nlevels - 1))
+               return 0;
+
+       if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+               return 0;
+
+       cur->bc_ra[lev] |= lr;
+       block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return xfs_btree_readahead_lblock(cur, lr, block);
+       return xfs_btree_readahead_sblock(cur, lr, block);
+}
+
+/*
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
+ */
+void
+xfs_btree_setbuf(
+       xfs_btree_cur_t         *cur,   /* btree cursor */
+       int                     lev,    /* level in btree */
+       xfs_buf_t               *bp)    /* new buffer to set */
+{
+       struct xfs_btree_block  *b;     /* btree block */
+       xfs_buf_t               *obp;   /* old buffer pointer */
 
        obp = cur->bc_bufs[lev];
        if (obp)
@@ -900,7 +824,7 @@ xfs_btree_setbuf(
        if (!bp)
                return;
        b = XFS_BUF_TO_BLOCK(bp);
-       if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
                if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
                        cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
                if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
@@ -912,3 +836,2855 @@ xfs_btree_setbuf(
                        cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
        }
 }
+
+STATIC int
+xfs_btree_ptr_is_null(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               return be64_to_cpu(ptr->l) == NULLFSBLOCK;
+       else
+               return be32_to_cpu(ptr->s) == NULLAGBLOCK;
+}
+
+STATIC void
+xfs_btree_set_ptr_null(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               ptr->l = cpu_to_be64(NULLFSBLOCK);
+       else
+               ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_ptr     *ptr,
+       int                     lr)
+{
+       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (lr == XFS_BB_RIGHTSIB)
+                       ptr->l = block->bb_u.l.bb_rightsib;
+               else
+                       ptr->l = block->bb_u.l.bb_leftsib;
+       } else {
+               if (lr == XFS_BB_RIGHTSIB)
+                       ptr->s = block->bb_u.s.bb_rightsib;
+               else
+                       ptr->s = block->bb_u.s.bb_leftsib;
+       }
+}
+
+STATIC void
+xfs_btree_set_sibling(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       union xfs_btree_ptr     *ptr,
+       int                     lr)
+{
+       ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               if (lr == XFS_BB_RIGHTSIB)
+                       block->bb_u.l.bb_rightsib = ptr->l;
+               else
+                       block->bb_u.l.bb_leftsib = ptr->l;
+       } else {
+               if (lr == XFS_BB_RIGHTSIB)
+                       block->bb_u.s.bb_rightsib = ptr->s;
+               else
+                       block->bb_u.s.bb_leftsib = ptr->s;
+       }
+}
+
+STATIC void
+xfs_btree_init_block(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     numrecs,
+       struct xfs_btree_block  *new)   /* new block */
+{
+       new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+       new->bb_level = cpu_to_be16(level);
+       new->bb_numrecs = cpu_to_be16(numrecs);
+
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+               new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+       } else {
+               new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+       }
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updateÑ• to this record.  The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+       struct xfs_btree_cur    *cur,
+       struct xfs_btree_block  *block,
+       int                     level)
+{
+       union xfs_btree_ptr     ptr;
+
+       if (level > 0)
+               return 0;
+       if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+               return 0;
+
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &ptr))
+               return 0;
+       return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+               ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+                                       XFS_BUF_ADDR(bp)));
+       else {
+               ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
+                                       XFS_BUF_ADDR(bp)));
+       }
+}
+
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
+
+               return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+       } else {
+               ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+               ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
+
+               return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+                                       be32_to_cpu(ptr->s));
+       }
+}
+
+STATIC void
+xfs_btree_set_refs(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
+{
+       switch (cur->bc_btnum) {
+       case XFS_BTNUM_BNO:
+       case XFS_BTNUM_CNT:
+               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+               break;
+       case XFS_BTNUM_INO:
+               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+               break;
+       case XFS_BTNUM_BMAP:
+               XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+               break;
+       default:
+               ASSERT(0);
+       }
+}
+
+STATIC int
+xfs_btree_get_buf_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     flags,
+       struct xfs_btree_block  **block,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       xfs_daddr_t             d;
+
+       /* need to sort out how callers deal with failures first */
+       ASSERT(!(flags & XFS_BUF_TRYLOCK));
+
+       d = xfs_btree_ptr_to_daddr(cur, ptr);
+       *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+                                mp->m_bsize, flags);
+
+       ASSERT(*bpp);
+       ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+       *block = XFS_BUF_TO_BLOCK(*bpp);
+       return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     level,
+       int                     flags,
+       struct xfs_btree_block  **block,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = cur->bc_mp;
+       xfs_daddr_t             d;
+       int                     error;
+
+       /* need to sort out how callers deal with failures first */
+       ASSERT(!(flags & XFS_BUF_TRYLOCK));
+
+       d = xfs_btree_ptr_to_daddr(cur, ptr);
+       error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+                                  mp->m_bsize, flags, bpp);
+       if (error)
+               return error;
+
+       ASSERT(*bpp != NULL);
+       ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+       xfs_btree_set_refs(cur, *bpp);
+       *block = XFS_BUF_TO_BLOCK(*bpp);
+
+       error = xfs_btree_check_block(cur, *block, level, *bpp);
+       if (error)
+               xfs_trans_brelse(cur->bc_tp, *bpp);
+       return error;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *dst_key,
+       union xfs_btree_key     *src_key,
+       int                     numkeys)
+{
+       ASSERT(numkeys >= 0);
+       memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *dst_rec,
+       union xfs_btree_rec     *src_rec,
+       int                     numrecs)
+{
+       ASSERT(numrecs >= 0);
+       memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *dst_ptr,
+       union xfs_btree_ptr     *src_ptr,
+       int                     numptrs)
+{
+       ASSERT(numptrs >= 0);
+       memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       int                     dir,
+       int                     numkeys)
+{
+       char                    *dst_key;
+
+       ASSERT(numkeys >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+       memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       int                     dir,
+       int                     numrecs)
+{
+       char                    *dst_rec;
+
+       ASSERT(numrecs >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+       memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr,
+       int                     dir,
+       int                     numptrs)
+{
+       char                    *dst_ptr;
+
+       ASSERT(numptrs >= 0);
+       ASSERT(dir == 1 || dir == -1);
+
+       dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+       memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     first,
+       int                     last)
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       if (bp) {
+               xfs_trans_log_buf(cur->bc_tp, bp,
+                                 xfs_btree_key_offset(cur, first),
+                                 xfs_btree_key_offset(cur, last + 1) - 1);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                               xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     first,
+       int                     last)
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       xfs_trans_log_buf(cur->bc_tp, bp,
+                         xfs_btree_rec_offset(cur, first),
+                         xfs_btree_rec_offset(cur, last + 1) - 1);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_buf          *bp,    /* buffer containing btree block */
+       int                     first,  /* index of first pointer to log */
+       int                     last)   /* index of last pointer to log */
+{
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+       if (bp) {
+               struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+               int                     level = xfs_btree_get_level(block);
+
+               xfs_trans_log_buf(cur->bc_tp, bp,
+                               xfs_btree_ptr_offset(cur, first, level),
+                               xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_buf          *bp,    /* buffer containing btree block */
+       int                     fields) /* mask of fields: XFS_BB_... */
+{
+       int                     first;  /* first byte offset logged */
+       int                     last;   /* last byte offset logged */
+       static const short      soffsets[] = {  /* table of offsets (short) */
+               offsetof(struct xfs_btree_block, bb_magic),
+               offsetof(struct xfs_btree_block, bb_level),
+               offsetof(struct xfs_btree_block, bb_numrecs),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+               offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+               XFS_BTREE_SBLOCK_LEN
+       };
+       static const short      loffsets[] = {  /* table of offsets (long) */
+               offsetof(struct xfs_btree_block, bb_magic),
+               offsetof(struct xfs_btree_block, bb_level),
+               offsetof(struct xfs_btree_block, bb_numrecs),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+               offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+               XFS_BTREE_LBLOCK_LEN
+       };
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+       if (bp) {
+               xfs_btree_offsets(fields,
+                                 (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+                                       loffsets : soffsets,
+                                 XFS_BB_NUM_BITS, &first, &last);
+               xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+       } else {
+               xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+                       xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int                                            /* error */
+xfs_btree_increment(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       struct xfs_btree_block  *block;
+       union xfs_btree_ptr     ptr;
+       struct xfs_buf          *bp;
+       int                     error;          /* error return value */
+       int                     lev;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       ASSERT(level < cur->bc_nlevels);
+
+       /* Read-ahead to the right at this level. */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+       /* Get a pointer to the btree block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* We're done if we remain in the block after the increment. */
+       if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+               goto out1;
+
+       /* Fail if we just went off the right edge of the tree. */
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &ptr))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, increment);
+
+       /*
+        * March up the tree incrementing pointers.
+        * Stop when we don't go off the right edge of a block.
+        */
+       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+               block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, block, lev, bp);
+               if (error)
+                       goto error0;
+#endif
+
+               if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+                       break;
+
+               /* Read-ahead the right block for the next loop. */
+               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+       }
+
+       /*
+        * If we went off the root then we are either seriously
+        * confused or have the tree root in an inode.
+        */
+       if (lev == cur->bc_nlevels) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                       goto out0;
+               ASSERT(0);
+               error = EFSCORRUPTED;
+               goto error0;
+       }
+       ASSERT(lev < cur->bc_nlevels);
+
+       /*
+        * Now walk back down the tree, fixing up the cursor's buffer
+        * pointers and key numbers.
+        */
+       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+               union xfs_btree_ptr     *ptrp;
+
+               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+               error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+                                                       0, &block, &bp);
+               if (error)
+                       goto error0;
+
+               xfs_btree_setbuf(cur, lev, bp);
+               cur->bc_ptrs[lev] = 1;
+       }
+out1:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int                                            /* error */
+xfs_btree_decrement(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       struct xfs_btree_block  *block;
+       xfs_buf_t               *bp;
+       int                     error;          /* error return value */
+       int                     lev;
+       union xfs_btree_ptr     ptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       ASSERT(level < cur->bc_nlevels);
+
+       /* Read-ahead to the left at this level. */
+       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+       /* We're done if we remain in the block after the decrement. */
+       if (--cur->bc_ptrs[level] > 0)
+               goto out1;
+
+       /* Get a pointer to the btree block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* Fail if we just went off the left edge of the tree. */
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+       if (xfs_btree_ptr_is_null(cur, &ptr))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, decrement);
+
+       /*
+        * March up the tree decrementing pointers.
+        * Stop when we don't go off the left edge of a block.
+        */
+       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+               if (--cur->bc_ptrs[lev] > 0)
+                       break;
+               /* Read-ahead the left block for the next loop. */
+               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+       }
+
+       /*
+        * If we went off the root then we are seriously confused.
+        * or the root of the tree is in an inode.
+        */
+       if (lev == cur->bc_nlevels) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+                       goto out0;
+               ASSERT(0);
+               error = EFSCORRUPTED;
+               goto error0;
+       }
+       ASSERT(lev < cur->bc_nlevels);
+
+       /*
+        * Now walk back down the tree, fixing up the cursor's buffer
+        * pointers and key numbers.
+        */
+       for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+               union xfs_btree_ptr     *ptrp;
+
+               ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+               error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+                                                       0, &block, &bp);
+               if (error)
+                       goto error0;
+               xfs_btree_setbuf(cur, lev, bp);
+               cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+       }
+out1:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+STATIC int
+xfs_btree_lookup_get_block(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* level in the btree */
+       union xfs_btree_ptr     *pp,    /* ptr to btree block */
+       struct xfs_btree_block  **blkp) /* return btree block */
+{
+       struct xfs_buf          *bp;    /* buffer pointer for btree block */
+       int                     error = 0;
+
+       /* special case the root block if in an inode */
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1)) {
+               *blkp = xfs_btree_get_iroot(cur);
+               return 0;
+       }
+
+       /*
+        * If the old buffer at this level for the disk address we are
+        * looking for re-use it.
+        *
+        * Otherwise throw it away and get a new one.
+        */
+       bp = cur->bc_bufs[level];
+       if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+               *blkp = XFS_BUF_TO_BLOCK(bp);
+               return 0;
+       }
+
+       error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+       if (error)
+               return error;
+
+       xfs_btree_setbuf(cur, level, bp);
+       return 0;
+}
+
+/*
+ * Get current search key.  For level 0 we don't actually have a key
+ * structure so we make one up from the record.  For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     keyno,
+       struct xfs_btree_block  *block,
+       union xfs_btree_key     *kp)
+{
+       if (level == 0) {
+               cur->bc_ops->init_key_from_rec(kp,
+                               xfs_btree_rec_addr(cur, keyno, block));
+               return kp;
+       }
+
+       return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Lookup the record.  The cursor is made to point to it, based on dir.
+ * Return 0 if can't find any such record, 1 for success.
+ */
+int                                    /* error */
+xfs_btree_lookup(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_lookup_t            dir,    /* <=, ==, or >= */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* current btree block */
+       __int64_t               diff;   /* difference for the current key */
+       int                     error;  /* error return value */
+       int                     keyno;  /* current key number */
+       int                     level;  /* level in the btree */
+       union xfs_btree_ptr     *pp;    /* ptr to btree block */
+       union xfs_btree_ptr     ptr;    /* ptr to btree block */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, dir);
+
+       XFS_BTREE_STATS_INC(cur, lookup);
+
+       block = NULL;
+       keyno = 0;
+
+       /* initialise start pointer from cursor */
+       cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+       pp = &ptr;
+
+       /*
+        * Iterate over each level in the btree, starting at the root.
+        * For each level above the leaves, find the key we need, based
+        * on the lookup record, then follow the corresponding block
+        * pointer down to the next level.
+        */
+       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+               /* Get the block we need to do the lookup on. */
+               error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+               if (error)
+                       goto error0;
+
+               if (diff == 0) {
+                       /*
+                        * If we already had a key match at a higher level, we
+                        * know we need to use the first entry in this block.
+                        */
+                       keyno = 1;
+               } else {
+                       /* Otherwise search this block. Do a binary search. */
+
+                       int     high;   /* high entry number */
+                       int     low;    /* low entry number */
+
+                       /* Set low and high entry numbers, 1-based. */
+                       low = 1;
+                       high = xfs_btree_get_numrecs(block);
+                       if (!high) {
+                               /* Block is empty, must be an empty leaf. */
+                               ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                               *stat = 0;
+                               return 0;
+                       }
+
+                       /* Binary search the block. */
+                       while (low <= high) {
+                               union xfs_btree_key     key;
+                               union xfs_btree_key     *kp;
+
+                               XFS_BTREE_STATS_INC(cur, compare);
+
+                               /* keyno is average of low and high. */
+                               keyno = (low + high) >> 1;
+
+                               /* Get current search key */
+                               kp = xfs_lookup_get_search_key(cur, level,
+                                               keyno, block, &key);
+
+                               /*
+                                * Compute difference to get next direction:
+                                *  - less than, move right
+                                *  - greater than, move left
+                                *  - equal, we're done
+                                */
+                               diff = cur->bc_ops->key_diff(cur, kp);
+                               if (diff < 0)
+                                       low = keyno + 1;
+                               else if (diff > 0)
+                                       high = keyno - 1;
+                               else
+                                       break;
+                       }
+               }
+
+               /*
+                * If there are more levels, set up for the next level
+                * by getting the block number and filling in the cursor.
+                */
+               if (level > 0) {
+                       /*
+                        * If we moved left, need the previous key number,
+                        * unless there isn't one.
+                        */
+                       if (diff > 0 && --keyno < 1)
+                               keyno = 1;
+                       pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+                       error = xfs_btree_check_ptr(cur, pp, 0, level);
+                       if (error)
+                               goto error0;
+#endif
+                       cur->bc_ptrs[level] = keyno;
+               }
+       }
+
+       /* Done with the search. See if we need to adjust the results. */
+       if (dir != XFS_LOOKUP_LE && diff < 0) {
+               keyno++;
+               /*
+                * If ge search and we went off the end of the block, but it's
+                * not the last block, we're in the wrong block.
+                */
+               xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+               if (dir == XFS_LOOKUP_GE &&
+                   keyno > xfs_btree_get_numrecs(block) &&
+                   !xfs_btree_ptr_is_null(cur, &ptr)) {
+                       int     i;
+
+                       cur->bc_ptrs[0] = keyno;
+                       error = xfs_btree_increment(cur, 0, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_RETURN(i == 1);
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                       *stat = 1;
+                       return 0;
+               }
+       } else if (dir == XFS_LOOKUP_LE && diff > 0)
+               keyno--;
+       cur->bc_ptrs[0] = keyno;
+
+       /* Return if we succeeded or not. */
+       if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+               *stat = 0;
+       else if (dir != XFS_LOOKUP_EQ || diff == 0)
+               *stat = 1;
+       else
+               *stat = 0;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *keyp,
+       int                     level)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       union xfs_btree_key     *kp;
+       int                     ptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+       ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+       /*
+        * Go up the tree from this level toward the root.
+        * At each level, update the key value to the value input.
+        * Stop when we reach a level where the cursor isn't pointing
+        * at the first entry in the block.
+        */
+       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+               int             error;
+#endif
+               block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, block, level, bp);
+               if (error) {
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                       return error;
+               }
+#endif
+               ptr = cur->bc_ptrs[level];
+               kp = xfs_btree_key_addr(cur, ptr, block);
+               xfs_btree_copy_keys(cur, kp, keyp, 1);
+               xfs_btree_log_keys(cur, bp, ptr, ptr);
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       struct xfs_btree_block  *block;
+       struct xfs_buf          *bp;
+       int                     error;
+       int                     ptr;
+       union xfs_btree_rec     *rp;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGR(cur, rec);
+
+       /* Pick up the current block. */
+       block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, 0, bp);
+       if (error)
+               goto error0;
+#endif
+       /* Get the address of the rec to be updated. */
+       ptr = cur->bc_ptrs[0];
+       rp = xfs_btree_rec_addr(cur, ptr, block);
+
+       /* Fill in the new contents and log them. */
+       xfs_btree_copy_recs(cur, rp, rec, 1);
+       xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, 0)) {
+               cur->bc_ops->update_lastrec(cur, block, rec,
+                                           ptr, LASTREC_UPDATE);
+       }
+
+       /* Updating first rec in leaf. Pass new key value up to our parent. */
+       if (ptr == 1) {
+               union xfs_btree_key     key;
+
+               cur->bc_ops->init_key_from_rec(&key, rec);
+               error = xfs_btree_updkey(cur, &key, 1);
+               if (error)
+                       goto error0;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int                                     /* error */
+xfs_btree_lshift(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_key     key;            /* btree key */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       int                     lrecs;          /* left record count */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       int                     rrecs;          /* right record count */
+       union xfs_btree_ptr     lptr;           /* left btree pointer */
+       union xfs_btree_key     *rkp = NULL;    /* right btree key */
+       union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
+       union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
+       int                     error;          /* error return value */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           level == cur->bc_nlevels - 1)
+               goto out0;
+
+       /* Set up variables for this block as "right". */
+       right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, right, level, rbp);
+       if (error)
+               goto error0;
+#endif
+
+       /* If we've got no left sibling then we can't shift an entry left. */
+       xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+       if (xfs_btree_ptr_is_null(cur, &lptr))
+               goto out0;
+
+       /*
+        * If the cursor entry is the one that would be moved, don't
+        * do it... it's too complicated.
+        */
+       if (cur->bc_ptrs[level] <= 1)
+               goto out0;
+
+       /* Set up the left neighbor as "left". */
+       error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+       if (error)
+               goto error0;
+
+       /* If it's full, it can't take another entry. */
+       lrecs = xfs_btree_get_numrecs(left);
+       if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+               goto out0;
+
+       rrecs = xfs_btree_get_numrecs(right);
+
+       /*
+        * We add one entry to the left side and remove one for the right side.
+        * Accout for it here, the changes will be updated on disk and logged
+        * later.
+        */
+       lrecs++;
+       rrecs--;
+
+       XFS_BTREE_STATS_INC(cur, lshift);
+       XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+       /*
+        * If non-leaf, copy a key and a ptr to the left block.
+        * Log the changes to the left block.
+        */
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+
+               lkp = xfs_btree_key_addr(cur, lrecs, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+
+               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, rpp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+               xfs_btree_copy_keys(cur, lkp, rkp, 1);
+               xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+               xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+               xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+               ASSERT(cur->bc_ops->keys_inorder(cur,
+                       xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, lrecs, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, lrp, rrp, 1);
+               xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+               ASSERT(cur->bc_ops->recs_inorder(cur,
+                       xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+       }
+
+       xfs_btree_set_numrecs(left, lrecs);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+       xfs_btree_set_numrecs(right, rrecs);
+       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+       /*
+        * Slide the contents of right down one entry.
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+       if (level > 0) {
+               /* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+               int                     i;              /* loop index */
+
+               for (i = 0; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+               xfs_btree_shift_keys(cur,
+                               xfs_btree_key_addr(cur, 2, right),
+                               -1, rrecs);
+               xfs_btree_shift_ptrs(cur,
+                               xfs_btree_ptr_addr(cur, 2, right),
+                               -1, rrecs);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+       } else {
+               /* It's a leaf. operate on records */
+               xfs_btree_shift_recs(cur,
+                       xfs_btree_rec_addr(cur, 2, right),
+                       -1, rrecs);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+               /*
+                * If it's the first record in the block, we'll need a key
+                * structure to pass up to the next level (updkey).
+                */
+               cur->bc_ops->init_key_from_rec(&key,
+                       xfs_btree_rec_addr(cur, 1, right));
+               rkp = &key;
+       }
+
+       /* Update the parent key values of right. */
+       error = xfs_btree_updkey(cur, rkp, level + 1);
+       if (error)
+               goto error0;
+
+       /* Slide the cursor value left one. */
+       cur->bc_ptrs[level]--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int                                     /* error */
+xfs_btree_rshift(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_key     key;            /* btree key */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+       union xfs_btree_ptr     rptr;           /* right block pointer */
+       union xfs_btree_key     *rkp;           /* right btree key */
+       int                     rrecs;          /* right record count */
+       int                     lrecs;          /* left record count */
+       int                     error;          /* error return value */
+       int                     i;              /* loop counter */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level == cur->bc_nlevels - 1))
+               goto out0;
+
+       /* Set up variables for this block as "left". */
+       left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, left, level, lbp);
+       if (error)
+               goto error0;
+#endif
+
+       /* If we've got no right sibling then we can't shift an entry right. */
+       xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+       if (xfs_btree_ptr_is_null(cur, &rptr))
+               goto out0;
+
+       /*
+        * If the cursor entry is the one that would be moved, don't
+        * do it... it's too complicated.
+        */
+       lrecs = xfs_btree_get_numrecs(left);
+       if (cur->bc_ptrs[level] >= lrecs)
+               goto out0;
+
+       /* Set up the right neighbor as "right". */
+       error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+       if (error)
+               goto error0;
+
+       /* If it's full, it can't take another entry. */
+       rrecs = xfs_btree_get_numrecs(right);
+       if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, rshift);
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+       /*
+        * Make a hole at the start of the right neighbor block, then
+        * copy the last left block entry to the hole.
+        */
+       if (level > 0) {
+               /* It's a nonleaf. make a hole in the keys and ptrs */
+               union xfs_btree_key     *lkp;
+               union xfs_btree_ptr     *lpp;
+               union xfs_btree_ptr     *rpp;
+
+               lkp = xfs_btree_key_addr(cur, lrecs, left);
+               lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+               for (i = rrecs - 1; i >= 0; i--) {
+                       error = xfs_btree_check_ptr(cur, rpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+               xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, lpp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+
+               /* Now put the new data in, and log it. */
+               xfs_btree_copy_keys(cur, rkp, lkp, 1);
+               xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+               ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+                       xfs_btree_key_addr(cur, 2, right)));
+       } else {
+               /* It's a leaf. make a hole in the records */
+               union xfs_btree_rec     *lrp;
+               union xfs_btree_rec     *rrp;
+
+               lrp = xfs_btree_rec_addr(cur, lrecs, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+               /* Now put the new data in, and log it. */
+               xfs_btree_copy_recs(cur, rrp, lrp, 1);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+               cur->bc_ops->init_key_from_rec(&key, rrp);
+               rkp = &key;
+
+               ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+                       xfs_btree_rec_addr(cur, 2, right)));
+       }
+
+       /*
+        * Decrement and log left's numrecs, bump and log right's numrecs.
+        */
+       xfs_btree_set_numrecs(left, --lrecs);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+       xfs_btree_set_numrecs(right, ++rrecs);
+       xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+       /*
+        * Using a temporary cursor, update the parent key values of the
+        * block on the right.
+        */
+       error = xfs_btree_dup_cursor(cur, &tcur);
+       if (error)
+               goto error0;
+       i = xfs_btree_lastrec(tcur, level);
+       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+       error = xfs_btree_increment(tcur, level, &i);
+       if (error)
+               goto error1;
+
+       error = xfs_btree_updkey(tcur, rkp, level + 1);
+       if (error)
+               goto error1;
+
+       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+
+error1:
+       XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Split cur/level block in half.
+ * Return new block number and the key to its first
+ * record (to be inserted into parent).
+ */
+STATIC int                                     /* error */
+xfs_btree_split(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       union xfs_btree_ptr     *ptrp,
+       union xfs_btree_key     *key,
+       struct xfs_btree_cur    **curp,
+       int                     *stat)          /* success/failure */
+{
+       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
+       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+       struct xfs_btree_block  *rrblock;       /* right-right btree block */
+       int                     lrecs;
+       int                     rrecs;
+       int                     src_index;
+       int                     error;          /* error return value */
+#ifdef DEBUG
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+       XFS_BTREE_STATS_INC(cur, split);
+
+       /* Set up left block (current one). */
+       left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, left, level, lbp);
+       if (error)
+               goto error0;
+#endif
+
+       xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0)
+               goto out0;
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Set up the new block as "right". */
+       error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+       if (error)
+               goto error0;
+
+       /* Fill in the btree header for the new right block. */
+       xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+
+       /*
+        * Split the entries between the old and the new block evenly.
+        * Make sure that if there's an odd number of entries now, that
+        * each new block will have the same number of entries.
+        */
+       lrecs = xfs_btree_get_numrecs(left);
+       rrecs = lrecs / 2;
+       if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+               rrecs++;
+       src_index = (lrecs - rrecs + 1);
+
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+       /*
+        * Copy btree block entries from the left block over to the
+        * new block, the right. Update the right block and log the
+        * changes.
+        */
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+               union xfs_btree_key     *rkp;   /* right btree key */
+               union xfs_btree_ptr     *rpp;   /* right address pointer */
+
+               lkp = xfs_btree_key_addr(cur, src_index, left);
+               lpp = xfs_btree_ptr_addr(cur, src_index, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+               for (i = src_index; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, lpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+               xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+               xfs_btree_log_keys(cur, rbp, 1, rrecs);
+               xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+               /* Grab the keys to the entries moved to the right block */
+               xfs_btree_copy_keys(cur, key, rkp, 1);
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+               union xfs_btree_rec     *rrp;   /* right record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, src_index, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+               xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+               cur->bc_ops->init_key_from_rec(key,
+                       xfs_btree_rec_addr(cur, 1, right));
+       }
+
+
+       /*
+        * Find the left block number by looking in the buffer.
+        * Adjust numrecs, sibling pointers.
+        */
+       xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+       xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+       xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+       xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+       lrecs -= rrecs;
+       xfs_btree_set_numrecs(left, lrecs);
+       xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+       xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+       /*
+        * If there's a block to the new block's right, make that block
+        * point back to right instead of to left.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+               error = xfs_btree_read_buf_block(cur, &rrptr, level,
+                                                       0, &rrblock, &rrbp);
+               if (error)
+                       goto error0;
+               xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+       }
+       /*
+        * If the cursor is really in the right block, move it there.
+        * If it's just pointing past the last entry in left, then we'll
+        * insert there, so don't change anything in that case.
+        */
+       if (cur->bc_ptrs[level] > lrecs + 1) {
+               xfs_btree_setbuf(cur, level, rbp);
+               cur->bc_ptrs[level] -= lrecs;
+       }
+       /*
+        * If there are more levels, we'll need another cursor which refers
+        * the right block, no matter where this cursor was.
+        */
+       if (level + 1 < cur->bc_nlevels) {
+               error = xfs_btree_dup_cursor(cur, curp);
+               if (error)
+                       goto error0;
+               (*curp)->bc_ptrs[level + 1]++;
+       }
+       *ptrp = rptr;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ */
+int                                            /* error */
+xfs_btree_new_iroot(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     *logflags,      /* logging flags for inode */
+       int                     *stat)          /* return status - 0 fail */
+{
+       struct xfs_buf          *cbp;           /* buffer for cblock */
+       struct xfs_btree_block  *block;         /* btree block */
+       struct xfs_btree_block  *cblock;        /* child btree block */
+       union xfs_btree_key     *ckp;           /* child key pointer */
+       union xfs_btree_ptr     *cpp;           /* child ptr pointer */
+       union xfs_btree_key     *kp;            /* pointer to btree key */
+       union xfs_btree_ptr     *pp;            /* pointer to block addr */
+       union xfs_btree_ptr     nptr;           /* new block addr */
+       int                     level;          /* btree level */
+       int                     error;          /* error return code */
+#ifdef DEBUG
+       int                     i;              /* loop counter */
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, newroot);
+
+       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+       level = cur->bc_nlevels - 1;
+
+       block = xfs_btree_get_iroot(cur);
+       pp = xfs_btree_ptr_addr(cur, 1, block);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               return 0;
+       }
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Copy the root into a real block. */
+       error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+       if (error)
+               goto error0;
+
+       memcpy(cblock, block, xfs_btree_block_len(cur));
+
+       be16_add_cpu(&block->bb_level, 1);
+       xfs_btree_set_numrecs(block, 1);
+       cur->bc_nlevels++;
+       cur->bc_ptrs[level + 1] = 1;
+
+       kp = xfs_btree_key_addr(cur, 1, block);
+       ckp = xfs_btree_key_addr(cur, 1, cblock);
+       xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
+
+       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+       for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+               error = xfs_btree_check_ptr(cur, pp, i, level);
+               if (error)
+                       goto error0;
+       }
+#endif
+       xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+       error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+       if (error)
+               goto error0;
+#endif
+       xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
+
+       xfs_iroot_realloc(cur->bc_private.b.ip,
+                         1 - xfs_btree_get_numrecs(cblock),
+                         cur->bc_private.b.whichfork);
+
+       xfs_btree_setbuf(cur, level, cbp);
+
+       /*
+        * Do all this logging at the end so that
+        * the root is at the right level.
+        */
+       xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+       xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+       xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+       *logflags |=
+               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+       *stat = 1;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ */
+STATIC int                             /* error */
+xfs_btree_new_root(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* one half of the old root block */
+       struct xfs_buf          *bp;    /* buffer containing block */
+       int                     error;  /* error return value */
+       struct xfs_buf          *lbp;   /* left buffer pointer */
+       struct xfs_btree_block  *left;  /* left btree block */
+       struct xfs_buf          *nbp;   /* new (root) buffer */
+       struct xfs_btree_block  *new;   /* new (root) btree block */
+       int                     nptr;   /* new value for key index, 1 or 2 */
+       struct xfs_buf          *rbp;   /* right buffer pointer */
+       struct xfs_btree_block  *right; /* right btree block */
+       union xfs_btree_ptr     rptr;
+       union xfs_btree_ptr     lptr;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, newroot);
+
+       /* initialise our start point from the cursor */
+       cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+       /* Allocate the new block. If we can't do it, we're toast. Give up. */
+       error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+       if (error)
+               goto error0;
+       if (*stat == 0)
+               goto out0;
+       XFS_BTREE_STATS_INC(cur, alloc);
+
+       /* Set up the new block. */
+       error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+       if (error)
+               goto error0;
+
+       /* Set the root in the holding structure  increasing the level by 1. */
+       cur->bc_ops->set_root(cur, &lptr, 1);
+
+       /*
+        * At the previous root level there are now two blocks: the old root,
+        * and the new block generated when it was split.  We don't know which
+        * one the cursor is pointing at, so we set up variables "left" and
+        * "right" for each case.
+        */
+       block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+       if (error)
+               goto error0;
+#endif
+
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+               /* Our block is left, pick up the right block. */
+               lbp = bp;
+               xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+               left = block;
+               error = xfs_btree_read_buf_block(cur, &rptr,
+                                       cur->bc_nlevels - 1, 0, &right, &rbp);
+               if (error)
+                       goto error0;
+               bp = rbp;
+               nptr = 1;
+       } else {
+               /* Our block is right, pick up the left block. */
+               rbp = bp;
+               xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+               right = block;
+               xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+               error = xfs_btree_read_buf_block(cur, &lptr,
+                                       cur->bc_nlevels - 1, 0, &left, &lbp);
+               if (error)
+                       goto error0;
+               bp = lbp;
+               nptr = 2;
+       }
+       /* Fill in the new block's btree header and log it. */
+       xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+       xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+       ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+                       !xfs_btree_ptr_is_null(cur, &rptr));
+
+       /* Fill in the key data in the new root. */
+       if (xfs_btree_get_level(left) > 0) {
+               xfs_btree_copy_keys(cur,
+                               xfs_btree_key_addr(cur, 1, new),
+                               xfs_btree_key_addr(cur, 1, left), 1);
+               xfs_btree_copy_keys(cur,
+                               xfs_btree_key_addr(cur, 2, new),
+                               xfs_btree_key_addr(cur, 1, right), 1);
+       } else {
+               cur->bc_ops->init_key_from_rec(
+                               xfs_btree_key_addr(cur, 1, new),
+                               xfs_btree_rec_addr(cur, 1, left));
+               cur->bc_ops->init_key_from_rec(
+                               xfs_btree_key_addr(cur, 2, new),
+                               xfs_btree_rec_addr(cur, 1, right));
+       }
+       xfs_btree_log_keys(cur, nbp, 1, 2);
+
+       /* Fill in the pointer data in the new root. */
+       xfs_btree_copy_ptrs(cur,
+               xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+       xfs_btree_copy_ptrs(cur,
+               xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+       xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+       /* Fix up the cursor. */
+       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+       cur->bc_ptrs[cur->bc_nlevels] = nptr;
+       cur->bc_nlevels++;
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 0;
+       return 0;
+}
+
+STATIC int
+xfs_btree_make_block_unfull(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* btree level */
+       int                     numrecs,/* # of recs in block */
+       int                     *oindex,/* old tree index */
+       int                     *index, /* new tree index */
+       union xfs_btree_ptr     *nptr,  /* new btree ptr */
+       struct xfs_btree_cur    **ncur, /* new btree cursor */
+       union xfs_btree_rec     *nrec,  /* new record */
+       int                     *stat)
+{
+       union xfs_btree_key     key;    /* new btree key value */
+       int                     error = 0;
+
+       if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           level == cur->bc_nlevels - 1) {
+               struct xfs_inode *ip = cur->bc_private.b.ip;
+
+               if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+                       /* A root block that can be made bigger. */
+
+                       xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+               } else {
+                       /* A root block that needs replacing */
+                       int     logflags = 0;
+
+                       error = xfs_btree_new_iroot(cur, &logflags, stat);
+                       if (error || *stat == 0)
+                               return error;
+
+                       xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+               }
+
+               return 0;
+       }
+
+       /* First, try shifting an entry to the right neighbor. */
+       error = xfs_btree_rshift(cur, level, stat);
+       if (error || *stat)
+               return error;
+
+       /* Next, try shifting an entry to the left neighbor. */
+       error = xfs_btree_lshift(cur, level, stat);
+       if (error)
+               return error;
+
+       if (*stat) {
+               *oindex = *index = cur->bc_ptrs[level];
+               return 0;
+       }
+
+       /*
+        * Next, try splitting the current block in half.
+        *
+        * If this works we have to re-set our variables because we
+        * could be in a different block now.
+        */
+       error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+       if (error || *stat == 0)
+               return error;
+
+
+       *index = cur->bc_ptrs[level];
+       cur->bc_ops->init_rec_from_key(&key, nrec);
+       return 0;
+}
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ */
+STATIC int
+xfs_btree_insrec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       int                     level,  /* level to insert record at */
+       union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
+       union xfs_btree_rec     *recp,  /* i/o: record data inserted */
+       struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
+       int                     *stat)  /* success/failure */
+{
+       struct xfs_btree_block  *block; /* btree block */
+       struct xfs_buf          *bp;    /* buffer for block */
+       union xfs_btree_key     key;    /* btree key */
+       union xfs_btree_ptr     nptr;   /* new block ptr */
+       struct xfs_btree_cur    *ncur;  /* new btree cursor */
+       union xfs_btree_rec     nrec;   /* new record count */
+       int                     optr;   /* old key/record index */
+       int                     ptr;    /* key/record index */
+       int                     numrecs;/* number of records */
+       int                     error;  /* error return value */
+#ifdef DEBUG
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+       ncur = NULL;
+
+       /*
+        * If we have an external root pointer, and we've made it to the
+        * root level, allocate a new root block and we're done.
+        */
+       if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+           (level >= cur->bc_nlevels)) {
+               error = xfs_btree_new_root(cur, stat);
+               xfs_btree_set_ptr_null(cur, ptrp);
+
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               return error;
+       }
+
+       /* If we're off the left edge, return failure. */
+       ptr = cur->bc_ptrs[level];
+       if (ptr == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       /* Make a key out of the record data to be inserted, and save it. */
+       cur->bc_ops->init_key_from_rec(&key, recp);
+
+       optr = ptr;
+
+       XFS_BTREE_STATS_INC(cur, insrec);
+
+       /* Get pointers to the btree buffer and block. */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+
+       /* Check that the new entry is being inserted in the right place. */
+       if (ptr <= numrecs) {
+               if (level == 0) {
+                       ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+                               xfs_btree_rec_addr(cur, ptr, block)));
+               } else {
+                       ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+                               xfs_btree_key_addr(cur, ptr, block)));
+               }
+       }
+#endif
+
+       /*
+        * If the block is full, we can't insert the new entry until we
+        * make the block un-full.
+        */
+       xfs_btree_set_ptr_null(cur, &nptr);
+       if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+               error = xfs_btree_make_block_unfull(cur, level, numrecs,
+                                       &optr, &ptr, &nptr, &ncur, &nrec, stat);
+               if (error || *stat == 0)
+                       goto error0;
+       }
+
+       /*
+        * The current block may have changed if the block was
+        * previously full and we have just made space in it.
+        */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               return error;
+#endif
+
+       /*
+        * At this point we know there's room for our new entry in the block
+        * we're pointing at.
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+       if (level > 0) {
+               /* It's a nonleaf. make a hole in the keys and ptrs */
+               union xfs_btree_key     *kp;
+               union xfs_btree_ptr     *pp;
+
+               kp = xfs_btree_key_addr(cur, ptr, block);
+               pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+               for (i = numrecs - ptr; i >= 0; i--) {
+                       error = xfs_btree_check_ptr(cur, pp, i, level);
+                       if (error)
+                               return error;
+               }
+#endif
+
+               xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+               xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+               error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+               if (error)
+                       goto error0;
+#endif
+
+               /* Now put the new data in, bump numrecs and log it. */
+               xfs_btree_copy_keys(cur, kp, &key, 1);
+               xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+               numrecs++;
+               xfs_btree_set_numrecs(block, numrecs);
+               xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+               xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+               if (ptr < numrecs) {
+                       ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+                               xfs_btree_key_addr(cur, ptr + 1, block)));
+               }
+#endif
+       } else {
+               /* It's a leaf. make a hole in the records */
+               union xfs_btree_rec             *rp;
+
+               rp = xfs_btree_rec_addr(cur, ptr, block);
+
+               xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+               /* Now put the new data in, bump numrecs and log it. */
+               xfs_btree_copy_recs(cur, rp, recp, 1);
+               xfs_btree_set_numrecs(block, ++numrecs);
+               xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+               if (ptr < numrecs) {
+                       ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+                               xfs_btree_rec_addr(cur, ptr + 1, block)));
+               }
+#endif
+       }
+
+       /* Log the new number of records in the btree header. */
+       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+       /* If we inserted at the start of a block, update the parents' keys. */
+       if (optr == 1) {
+               error = xfs_btree_updkey(cur, &key, level + 1);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, level)) {
+               cur->bc_ops->update_lastrec(cur, block, recp,
+                                           ptr, LASTREC_INSREC);
+       }
+
+       /*
+        * Return the new block number, if any.
+        * If there is one, give back a record value and a cursor too.
+        */
+       *ptrp = nptr;
+       if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+               *recp = nrec;
+               *curp = ncur;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ */
+int
+xfs_btree_insert(
+       struct xfs_btree_cur    *cur,
+       int                     *stat)
+{
+       int                     error;  /* error return value */
+       int                     i;      /* result value, 0 for failure */
+       int                     level;  /* current level number in btree */
+       union xfs_btree_ptr     nptr;   /* new block number (split result) */
+       struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
+       struct xfs_btree_cur    *pcur;  /* previous level's cursor */
+       union xfs_btree_rec     rec;    /* record to insert */
+
+       level = 0;
+       ncur = NULL;
+       pcur = cur;
+
+       xfs_btree_set_ptr_null(cur, &nptr);
+       cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+       /*
+        * Loop going up the tree, starting at the leaf level.
+        * Stop when we don't get a split block, that must mean that
+        * the insert is finished with this level.
+        */
+       do {
+               /*
+                * Insert nrec/nptr into this level of the tree.
+                * Note if we fail, nptr will be null.
+                */
+               error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+               if (error) {
+                       if (pcur != cur)
+                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+                       goto error0;
+               }
+
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               level++;
+
+               /*
+                * See if the cursor we just used is trash.
+                * Can't trash the caller's cursor, but otherwise we should
+                * if ncur is a new cursor or we're about to be done.
+                */
+               if (pcur != cur &&
+                   (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+                       /* Save the state from the cursor before we trash it */
+                       if (cur->bc_ops->update_cursor)
+                               cur->bc_ops->update_cursor(pcur, cur);
+                       cur->bc_nlevels = pcur->bc_nlevels;
+                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+               }
+               /* If we got a new cursor, switch to it. */
+               if (ncur) {
+                       pcur = ncur;
+                       ncur = NULL;
+               }
+       } while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = i;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block.  But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ */
+int
+xfs_btree_kill_iroot(
+       struct xfs_btree_cur    *cur)
+{
+       int                     whichfork = cur->bc_private.b.whichfork;
+       struct xfs_inode        *ip = cur->bc_private.b.ip;
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_btree_block  *block;
+       struct xfs_btree_block  *cblock;
+       union xfs_btree_key     *kp;
+       union xfs_btree_key     *ckp;
+       union xfs_btree_ptr     *pp;
+       union xfs_btree_ptr     *cpp;
+       struct xfs_buf          *cbp;
+       int                     level;
+       int                     index;
+       int                     numrecs;
+#ifdef DEBUG
+       union xfs_btree_ptr     ptr;
+       int                     i;
+#endif
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+       ASSERT(cur->bc_nlevels > 1);
+
+       /*
+        * Don't deal with the root block needs to be a leaf case.
+        * We're just going to turn the thing back into extents anyway.
+        */
+       level = cur->bc_nlevels - 1;
+       if (level == 1)
+               goto out0;
+
+       /*
+        * Give up if the root has multiple children.
+        */
+       block = xfs_btree_get_iroot(cur);
+       if (xfs_btree_get_numrecs(block) != 1)
+               goto out0;
+
+       cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+       numrecs = xfs_btree_get_numrecs(cblock);
+
+       /*
+        * Only do this if the next level will fit.
+        * Then the data must be copied up to the inode,
+        * instead of freeing the root you free the next level.
+        */
+       if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+               goto out0;
+
+       XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+       xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+       ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+       index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+       if (index) {
+               xfs_iroot_realloc(cur->bc_private.b.ip, index,
+                                 cur->bc_private.b.whichfork);
+               block = ifp->if_broot;
+       }
+
+       be16_add_cpu(&block->bb_numrecs, index);
+       ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+       kp = xfs_btree_key_addr(cur, 1, block);
+       ckp = xfs_btree_key_addr(cur, 1, cblock);
+       xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+       pp = xfs_btree_ptr_addr(cur, 1, block);
+       cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+       for (i = 0; i < numrecs; i++) {
+               int             error;
+
+               error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+               if (error) {
+                       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+                       return error;
+               }
+       }
+#endif
+       xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+       cur->bc_ops->free_block(cur, cbp);
+       XFS_BTREE_STATS_INC(cur, free);
+
+       cur->bc_bufs[level - 1] = NULL;
+       be16_add_cpu(&block->bb_level, -1);
+       xfs_trans_log_inode(cur->bc_tp, ip,
+               XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+       cur->bc_nlevels--;
+out0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       return 0;
+}
+
+STATIC int
+xfs_btree_dec_cursor(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     *stat)
+{
+       int                     error;
+       int                     i;
+
+       if (level > 0) {
+               error = xfs_btree_decrement(cur, level, &i);
+               if (error)
+                       return error;
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = 1;
+       return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int                                     /* error */
+xfs_btree_delrec(
+       struct xfs_btree_cur    *cur,           /* btree cursor */
+       int                     level,          /* level removing record from */
+       int                     *stat)          /* fail/done/go-on */
+{
+       struct xfs_btree_block  *block;         /* btree block */
+       union xfs_btree_ptr     cptr;           /* current block ptr */
+       struct xfs_buf          *bp;            /* buffer for block */
+       int                     error;          /* error return value */
+       int                     i;              /* loop counter */
+       union xfs_btree_key     key;            /* storage for keyp */
+       union xfs_btree_key     *keyp = &key;   /* passed to the next level */
+       union xfs_btree_ptr     lptr;           /* left sibling block ptr */
+       struct xfs_buf          *lbp;           /* left buffer pointer */
+       struct xfs_btree_block  *left;          /* left btree block */
+       int                     lrecs = 0;      /* left record count */
+       int                     ptr;            /* key/record index */
+       union xfs_btree_ptr     rptr;           /* right sibling block ptr */
+       struct xfs_buf          *rbp;           /* right buffer pointer */
+       struct xfs_btree_block  *right;         /* right btree block */
+       struct xfs_btree_block  *rrblock;       /* right-right btree block */
+       struct xfs_buf          *rrbp;          /* right-right buffer pointer */
+       int                     rrecs = 0;      /* right record count */
+       struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
+       int                     numrecs;        /* temporary numrec count */
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_TRACE_ARGI(cur, level);
+
+       tcur = NULL;
+
+       /* Get the index of the entry being deleted, check for nothing there. */
+       ptr = cur->bc_ptrs[level];
+       if (ptr == 0) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       /* Get the buffer & block containing the record or key/ptr. */
+       block = xfs_btree_get_block(cur, level, &bp);
+       numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, level, bp);
+       if (error)
+               goto error0;
+#endif
+
+       /* Fail if we're off the end of the block. */
+       if (ptr > numrecs) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+               *stat = 0;
+               return 0;
+       }
+
+       XFS_BTREE_STATS_INC(cur, delrec);
+       XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+       /* Excise the entries being deleted. */
+       if (level > 0) {
+               /* It's a nonleaf. operate on keys and ptrs */
+               union xfs_btree_key     *lkp;
+               union xfs_btree_ptr     *lpp;
+
+               lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+               lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+               for (i = 0; i < numrecs - ptr; i++) {
+                       error = xfs_btree_check_ptr(cur, lpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+
+               if (ptr < numrecs) {
+                       xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+                       xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+                       xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+                       xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+               }
+
+               /*
+                * If it's the first record in the block, we'll need to pass a
+                * key up to the next level (updkey).
+                */
+               if (ptr == 1)
+                       keyp = xfs_btree_key_addr(cur, 1, block);
+       } else {
+               /* It's a leaf. operate on records */
+               if (ptr < numrecs) {
+                       xfs_btree_shift_recs(cur,
+                               xfs_btree_rec_addr(cur, ptr + 1, block),
+                               -1, numrecs - ptr);
+                       xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+               }
+
+               /*
+                * If it's the first record in the block, we'll need a key
+                * structure to pass up to the next level (updkey).
+                */
+               if (ptr == 1) {
+                       cur->bc_ops->init_key_from_rec(&key,
+                                       xfs_btree_rec_addr(cur, 1, block));
+                       keyp = &key;
+               }
+       }
+
+       /*
+        * Decrement and log the number of entries in the block.
+        */
+       xfs_btree_set_numrecs(block, --numrecs);
+       xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+       /*
+        * If we are tracking the last record in the tree and
+        * we are at the far right edge of the tree, update it.
+        */
+       if (xfs_btree_is_lastrec(cur, block, level)) {
+               cur->bc_ops->update_lastrec(cur, block, NULL,
+                                           ptr, LASTREC_DELREC);
+       }
+
+       /*
+        * We're at the root level.  First, shrink the root block in-memory.
+        * Try to get rid of the next level down.  If we can't then there's
+        * nothing left to do.
+        */
+       if (level == cur->bc_nlevels - 1) {
+               if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+                       xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+                                         cur->bc_private.b.whichfork);
+
+                       error = xfs_btree_kill_iroot(cur);
+                       if (error)
+                               goto error0;
+
+                       error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+                       *stat = 1;
+                       return 0;
+               }
+
+               /*
+                * If this is the root level, and there's only one entry left,
+                * and it's NOT the leaf level, then we can get rid of this
+                * level.
+                */
+               if (numrecs == 1 && level > 0) {
+                       union xfs_btree_ptr     *pp;
+                       /*
+                        * pp is still set to the first pointer in the block.
+                        * Make it the new root of the btree.
+                        */
+                       pp = xfs_btree_ptr_addr(cur, 1, block);
+                       error = cur->bc_ops->kill_root(cur, bp, level, pp);
+                       if (error)
+                               goto error0;
+               } else if (level > 0) {
+                       error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+               }
+               *stat = 1;
+               return 0;
+       }
+
+       /*
+        * If we deleted the leftmost entry in the block, update the
+        * key values above us in the tree.
+        */
+       if (ptr == 1) {
+               error = xfs_btree_updkey(cur, keyp, level + 1);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * If the number of records remaining in the block is at least
+        * the minimum, we're done.
+        */
+       if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+               error = xfs_btree_dec_cursor(cur, level, stat);
+               if (error)
+                       goto error0;
+               return 0;
+       }
+
+       /*
+        * Otherwise, we have to move some records around to keep the
+        * tree balanced.  Look at the left and right sibling blocks to
+        * see if we can re-balance by moving only one record.
+        */
+       xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+       xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+       if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+               /*
+                * One child of root, need to get a chance to copy its contents
+                * into the root and delete it. Can't go up to next level,
+                * there's nothing to delete there.
+                */
+               if (xfs_btree_ptr_is_null(cur, &rptr) &&
+                   xfs_btree_ptr_is_null(cur, &lptr) &&
+                   level == cur->bc_nlevels - 2) {
+                       error = xfs_btree_kill_iroot(cur);
+                       if (!error)
+                               error = xfs_btree_dec_cursor(cur, level, stat);
+                       if (error)
+                               goto error0;
+                       return 0;
+               }
+       }
+
+       ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+              !xfs_btree_ptr_is_null(cur, &lptr));
+
+       /*
+        * Duplicate the cursor so our btree manipulations here won't
+        * disrupt the next level up.
+        */
+       error = xfs_btree_dup_cursor(cur, &tcur);
+       if (error)
+               goto error0;
+
+       /*
+        * If there's a right sibling, see if it's ok to shift an entry
+        * out of it.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+               /*
+                * Move the temp cursor to the last entry in the next block.
+                * Actually any entry but the first would suffice.
+                */
+               i = xfs_btree_lastrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               error = xfs_btree_increment(tcur, level, &i);
+               if (error)
+                       goto error0;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               i = xfs_btree_lastrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               /* Grab a pointer to the block. */
+               right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(tcur, right, level, rbp);
+               if (error)
+                       goto error0;
+#endif
+               /* Grab the current block number, for future use. */
+               xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+               /*
+                * If right block is full enough so that removing one entry
+                * won't make it too empty, and left-shifting an entry out
+                * of right to us works, we're done.
+                */
+               if (xfs_btree_get_numrecs(right) - 1 >=
+                   cur->bc_ops->get_minrecs(tcur, level)) {
+                       error = xfs_btree_lshift(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       if (i) {
+                               ASSERT(xfs_btree_get_numrecs(block) >=
+                                      cur->bc_ops->get_minrecs(tcur, level));
+
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                               tcur = NULL;
+
+                               error = xfs_btree_dec_cursor(cur, level, stat);
+                               if (error)
+                                       goto error0;
+                               return 0;
+                       }
+               }
+
+               /*
+                * Otherwise, grab the number of records in right for
+                * future reference, and fix up the temp cursor to point
+                * to our block again (last record).
+                */
+               rrecs = xfs_btree_get_numrecs(right);
+               if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+                       i = xfs_btree_firstrec(tcur, level);
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+                       error = xfs_btree_decrement(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+               }
+       }
+
+       /*
+        * If there's a left sibling, see if it's ok to shift an entry
+        * out of it.
+        */
+       if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+               /*
+                * Move the temp cursor to the first entry in the
+                * previous block.
+                */
+               i = xfs_btree_firstrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               error = xfs_btree_decrement(tcur, level, &i);
+               if (error)
+                       goto error0;
+               i = xfs_btree_firstrec(tcur, level);
+               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+               /* Grab a pointer to the block. */
+               left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+               error = xfs_btree_check_block(cur, left, level, lbp);
+               if (error)
+                       goto error0;
+#endif
+               /* Grab the current block number, for future use. */
+               xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+               /*
+                * If left block is full enough so that removing one entry
+                * won't make it too empty, and right-shifting an entry out
+                * of left to us works, we're done.
+                */
+               if (xfs_btree_get_numrecs(left) - 1 >=
+                   cur->bc_ops->get_minrecs(tcur, level)) {
+                       error = xfs_btree_rshift(tcur, level, &i);
+                       if (error)
+                               goto error0;
+                       if (i) {
+                               ASSERT(xfs_btree_get_numrecs(block) >=
+                                      cur->bc_ops->get_minrecs(tcur, level));
+                               xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+                               tcur = NULL;
+                               if (level == 0)
+                                       cur->bc_ptrs[0]++;
+                               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+                               *stat = 1;
+                               return 0;
+                       }
+               }
+
+               /*
+                * Otherwise, grab the number of records in right for
+                * future reference.
+                */
+               lrecs = xfs_btree_get_numrecs(left);
+       }
+
+       /* Delete the temp cursor, we're done with it. */
+       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+       tcur = NULL;
+
+       /* If here, we need to do a join to keep the tree balanced. */
+       ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+       if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+           lrecs + xfs_btree_get_numrecs(block) <=
+                       cur->bc_ops->get_maxrecs(cur, level)) {
+               /*
+                * Set "right" to be the starting block,
+                * "left" to be the left neighbor.
+                */
+               rptr = cptr;
+               right = block;
+               rbp = bp;
+               error = xfs_btree_read_buf_block(cur, &lptr, level,
+                                                       0, &left, &lbp);
+               if (error)
+                       goto error0;
+
+       /*
+        * If that won't work, see if we can join with the right neighbor block.
+        */
+       } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+                  rrecs + xfs_btree_get_numrecs(block) <=
+                       cur->bc_ops->get_maxrecs(cur, level)) {
+               /*
+                * Set "left" to be the starting block,
+                * "right" to be the right neighbor.
+                */
+               lptr = cptr;
+               left = block;
+               lbp = bp;
+               error = xfs_btree_read_buf_block(cur, &rptr, level,
+                                                       0, &right, &rbp);
+               if (error)
+                       goto error0;
+
+       /*
+        * Otherwise, we can't fix the imbalance.
+        * Just return.  This is probably a logic error, but it's not fatal.
+        */
+       } else {
+               error = xfs_btree_dec_cursor(cur, level, stat);
+               if (error)
+                       goto error0;
+               return 0;
+       }
+
+       rrecs = xfs_btree_get_numrecs(right);
+       lrecs = xfs_btree_get_numrecs(left);
+
+       /*
+        * We're now going to join "left" and "right" by moving all the stuff
+        * in "right" to "left" and deleting "right".
+        */
+       XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+       if (level > 0) {
+               /* It's a non-leaf.  Move keys and pointers. */
+               union xfs_btree_key     *lkp;   /* left btree key */
+               union xfs_btree_ptr     *lpp;   /* left address pointer */
+               union xfs_btree_key     *rkp;   /* right btree key */
+               union xfs_btree_ptr     *rpp;   /* right address pointer */
+
+               lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+               lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+               rkp = xfs_btree_key_addr(cur, 1, right);
+               rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+               for (i = 1; i < rrecs; i++) {
+                       error = xfs_btree_check_ptr(cur, rpp, i, level);
+                       if (error)
+                               goto error0;
+               }
+#endif
+               xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+               xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+               xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+               xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+       } else {
+               /* It's a leaf.  Move records.  */
+               union xfs_btree_rec     *lrp;   /* left record pointer */
+               union xfs_btree_rec     *rrp;   /* right record pointer */
+
+               lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+               rrp = xfs_btree_rec_addr(cur, 1, right);
+
+               xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+               xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+       }
+
+       XFS_BTREE_STATS_INC(cur, join);
+
+       /*
+        * Fix up the the number of records and right block pointer in the
+        * surviving block, and log it.
+        */
+       xfs_btree_set_numrecs(left, lrecs + rrecs);
+       xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+       xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+       xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+       /* If there is a right sibling, point it to the remaining block. */
+       xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+       if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+               error = xfs_btree_read_buf_block(cur, &cptr, level,
+                                                       0, &rrblock, &rrbp);
+               if (error)
+                       goto error0;
+               xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+               xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+       }
+
+       /* Free the deleted block. */
+       error = cur->bc_ops->free_block(cur, rbp);
+       if (error)
+               goto error0;
+       XFS_BTREE_STATS_INC(cur, free);
+
+       /*
+        * If we joined with the left neighbor, set the buffer in the
+        * cursor to the left block, and fix up the index.
+        */
+       if (bp != lbp) {
+               cur->bc_bufs[level] = lbp;
+               cur->bc_ptrs[level] += lrecs;
+               cur->bc_ra[level] = 0;
+       }
+       /*
+        * If we joined with the right neighbor and there's a level above
+        * us, increment the cursor at that level.
+        */
+       else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+                  (level + 1 < cur->bc_nlevels)) {
+               error = xfs_btree_increment(cur, level + 1, &i);
+               if (error)
+                       goto error0;
+       }
+
+       /*
+        * Readjust the ptr at this level if it's not a leaf, since it's
+        * still pointing at the deletion point, which makes the cursor
+        * inconsistent.  If this makes the ptr 0, the caller fixes it up.
+        * We can't use decrement because it would change the next level up.
+        */
+       if (level > 0)
+               cur->bc_ptrs[level]--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       /* Return value means the next level up has something to do. */
+       *stat = 2;
+       return 0;
+
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       if (tcur)
+               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+       return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ */
+int                                    /* error */
+xfs_btree_delete(
+       struct xfs_btree_cur    *cur,
+       int                     *stat)  /* success/failure */
+{
+       int                     error;  /* error return value */
+       int                     level;
+       int                     i;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+       /*
+        * Go up the tree, starting at leaf level.
+        *
+        * If 2 is returned then a join was done; go to the next level.
+        * Otherwise we are done.
+        */
+       for (level = 0, i = 2; i == 2; level++) {
+               error = xfs_btree_delrec(cur, level, &i);
+               if (error)
+                       goto error0;
+       }
+
+       if (i == 0) {
+               for (level = 1; level < cur->bc_nlevels; level++) {
+                       if (cur->bc_ptrs[level] == 0) {
+                               error = xfs_btree_decrement(cur, level, &i);
+                               if (error)
+                                       goto error0;
+                               break;
+                       }
+               }
+       }
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+       *stat = i;
+       return 0;
+error0:
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+       return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int                                    /* error */
+xfs_btree_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       union xfs_btree_rec     **recp, /* output: btree record */
+       int                     *stat)  /* output: success/failure */
+{
+       struct xfs_btree_block  *block; /* btree block */
+       struct xfs_buf          *bp;    /* buffer pointer */
+       int                     ptr;    /* record number */
+#ifdef DEBUG
+       int                     error;  /* error return value */
+#endif
+
+       ptr = cur->bc_ptrs[0];
+       block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+       error = xfs_btree_check_block(cur, block, 0, bp);
+       if (error)
+               return error;
+#endif
+
+       /*
+        * Off the right end or left end, return failure.
+        */
+       if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+               *stat = 0;
+               return 0;
+       }
+
+       /*
+        * Point to the record and extract its data.
+        */
+       *recp = xfs_btree_rec_addr(cur, ptr, block);
+       *stat = 1;
+       return 0;
+}
index 1f528a2a37545d858d40577a948e56ee7699d479..789fffdf8b2f557bc1aa3ed3c51acc5081ee941b 100644 (file)
@@ -39,39 +39,19 @@ extern kmem_zone_t  *xfs_btree_cur_zone;
 #define        XFS_BTNUM_INO   ((xfs_btnum_t)XFS_BTNUM_INOi)
 
 /*
- * Short form header: space allocation btrees.
- */
-typedef struct xfs_btree_sblock {
-       __be32          bb_magic;       /* magic number for block type */
-       __be16          bb_level;       /* 0 is a leaf */
-       __be16          bb_numrecs;     /* current # of data records */
-       __be32          bb_leftsib;     /* left sibling block or NULLAGBLOCK */
-       __be32          bb_rightsib;    /* right sibling block or NULLAGBLOCK */
-} xfs_btree_sblock_t;
-
-/*
- * Long form header: bmap btrees.
- */
-typedef struct xfs_btree_lblock {
-       __be32          bb_magic;       /* magic number for block type */
-       __be16          bb_level;       /* 0 is a leaf */
-       __be16          bb_numrecs;     /* current # of data records */
-       __be64          bb_leftsib;     /* left sibling block or NULLDFSBNO */
-       __be64          bb_rightsib;    /* right sibling block or NULLDFSBNO */
-} xfs_btree_lblock_t;
-
-/*
- * Combined header and structure, used by common code.
+ * Generic btree header.
+ *
+ * This is a comination of the actual format used on disk for short and long
+ * format btrees.  The first three fields are shared by both format, but
+ * the pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use
+ * the size macros below.  Never use sizeof(xfs_btree_block).
  */
-typedef struct xfs_btree_hdr
-{
+struct xfs_btree_block {
        __be32          bb_magic;       /* magic number for block type */
        __be16          bb_level;       /* 0 is a leaf */
        __be16          bb_numrecs;     /* current # of data records */
-} xfs_btree_hdr_t;
-
-typedef struct xfs_btree_block {
-       xfs_btree_hdr_t bb_h;           /* header */
        union {
                struct {
                        __be32          bb_leftsib;
@@ -82,7 +62,36 @@ typedef struct xfs_btree_block {
                        __be64          bb_rightsib;
                } l;                    /* long form pointers */
        } bb_u;                         /* rest */
-} xfs_btree_block_t;
+};
+
+#define XFS_BTREE_SBLOCK_LEN   16      /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN   24      /* size of a long form block */
+
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ */
+union xfs_btree_ptr {
+       __be32                  s;      /* short form ptr */
+       __be64                  l;      /* long form ptr */
+};
+
+union xfs_btree_key {
+       xfs_bmbt_key_t          bmbt;
+       xfs_bmdr_key_t          bmbr;   /* bmbt root block */
+       xfs_alloc_key_t         alloc;
+       xfs_inobt_key_t         inobt;
+};
+
+union xfs_btree_rec {
+       xfs_bmbt_rec_t          bmbt;
+       xfs_bmdr_rec_t          bmbr;   /* bmbt root block */
+       xfs_alloc_rec_t         alloc;
+       xfs_inobt_rec_t         inobt;
+};
 
 /*
  * For logging record fields.
@@ -95,47 +104,132 @@ typedef struct xfs_btree_block {
 #define        XFS_BB_NUM_BITS         5
 #define        XFS_BB_ALL_BITS         ((1 << XFS_BB_NUM_BITS) - 1)
 
-/*
- * Boolean to select which form of xfs_btree_block_t.bb_u to use.
- */
-#define        XFS_BTREE_LONG_PTRS(btnum)      ((btnum) == XFS_BTNUM_BMAP)
-
 /*
  * Magic numbers for btree blocks.
  */
 extern const __uint32_t        xfs_magics[];
 
 /*
- * Maximum and minimum records in a btree block.
- * Given block size, type prefix, and leaf flag (0 or 1).
- * The divisor below is equivalent to lf ? (e1) : (e2) but that produces
- * compiler warnings.
- */
-#define        XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf)       \
-       ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
-        (((lf) * (uint)sizeof(t ## _rec_t)) + \
-         ((1 - (lf)) * \
-          ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
-#define        XFS_BTREE_BLOCK_MINRECS(bsz,t,lf)       \
-       (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
-
-/*
- * Record, key, and pointer address calculation macros.
- * Given block size, type prefix, block pointer, and index of requested entry
- * (first entry numbered 1).
- */
-#define        XFS_BTREE_REC_ADDR(t,bb,i)      \
-       ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
-        ((i) - 1) * sizeof(t ## _rec_t)))
-#define        XFS_BTREE_KEY_ADDR(t,bb,i)      \
-       ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
-        ((i) - 1) * sizeof(t ## _key_t)))
-#define        XFS_BTREE_PTR_ADDR(t,bb,i,mxr)  \
-       ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
-        (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
+ * Generic stats interface
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+       XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat)  \
+do {    \
+       switch (cur->bc_btnum) {  \
+       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break;   \
+       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break;   \
+       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break;  \
+       case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break;    \
+       case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;       \
+       }       \
+} while (0)
+
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+       XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define XFS_BTREE_STATS_ADD(cur, stat, val)  \
+do {    \
+       switch (cur->bc_btnum) {  \
+       case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+       case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+       case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+       case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+       case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;       \
+       }       \
+} while (0)
 
 #define        XFS_BTREE_MAXLEVELS     8       /* max of all btrees */
 
+struct xfs_btree_ops {
+       /* size of the key and record structures */
+       size_t  key_len;
+       size_t  rec_len;
+
+       /* cursor operations */
+       struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+       void    (*update_cursor)(struct xfs_btree_cur *src,
+                                struct xfs_btree_cur *dst);
+
+       /* update btree root pointer */
+       void    (*set_root)(struct xfs_btree_cur *cur,
+                               union xfs_btree_ptr *nptr, int level_change);
+       int     (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
+                               int level, union xfs_btree_ptr *newroot);
+
+       /* block allocation / freeing */
+       int     (*alloc_block)(struct xfs_btree_cur *cur,
+                              union xfs_btree_ptr *start_bno,
+                              union xfs_btree_ptr *new_bno,
+                              int length, int *stat);
+       int     (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+       /* update last record information */
+       void    (*update_lastrec)(struct xfs_btree_cur *cur,
+                                 struct xfs_btree_block *block,
+                                 union xfs_btree_rec *rec,
+                                 int ptr, int reason);
+
+       /* records in block/level */
+       int     (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+       int     (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+       /* records on disk.  Matter for the root in inode case. */
+       int     (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+       /* init values of btree structures */
+       void    (*init_key_from_rec)(union xfs_btree_key *key,
+                                    union xfs_btree_rec *rec);
+       void    (*init_rec_from_key)(union xfs_btree_key *key,
+                                    union xfs_btree_rec *rec);
+       void    (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+                                    union xfs_btree_rec *rec);
+       void    (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+                                    union xfs_btree_ptr *ptr);
+
+       /* difference between key value and cursor value */
+       __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+                             union xfs_btree_key *key);
+
+#ifdef DEBUG
+       /* check that k1 is lower than k2 */
+       int     (*keys_inorder)(struct xfs_btree_cur *cur,
+                               union xfs_btree_key *k1,
+                               union xfs_btree_key *k2);
+
+       /* check that r1 is lower than r2 */
+       int     (*recs_inorder)(struct xfs_btree_cur *cur,
+                               union xfs_btree_rec *r1,
+                               union xfs_btree_rec *r2);
+#endif
+
+       /* btree tracing */
+#ifdef XFS_BTREE_TRACE
+       void            (*trace_enter)(struct xfs_btree_cur *, const char *,
+                                      char *, int, int, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t,
+                                      __psunsigned_t, __psunsigned_t);
+       void            (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
+                                       __uint64_t *, __uint64_t *);
+       void            (*trace_key)(struct xfs_btree_cur *,
+                                    union xfs_btree_key *, __uint64_t *,
+                                    __uint64_t *);
+       void            (*trace_record)(struct xfs_btree_cur *,
+                                       union xfs_btree_rec *, __uint64_t *,
+                                       __uint64_t *, __uint64_t *);
+#endif
+};
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
+
+
 /*
  * Btree cursor structure.
  * This collects all information needed by the btree code in one place.
@@ -144,6 +238,8 @@ typedef struct xfs_btree_cur
 {
        struct xfs_trans        *bc_tp; /* transaction we're in, if any */
        struct xfs_mount        *bc_mp; /* file system mount struct */
+       const struct xfs_btree_ops *bc_ops;
+       uint                    bc_flags; /* btree features - below */
        union {
                xfs_alloc_rec_incore_t  a;
                xfs_bmbt_irec_t         b;
@@ -175,94 +271,40 @@ typedef struct xfs_btree_cur
        }               bc_private;     /* per-btree type data */
 } xfs_btree_cur_t;
 
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS            (1<<0)  /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE                (1<<1)  /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE       (1<<2)  /* track last rec externally */
+
+
 #define        XFS_BTREE_NOERROR       0
 #define        XFS_BTREE_ERROR         1
 
 /*
  * Convert from buffer to btree block header.
  */
-#define        XFS_BUF_TO_BLOCK(bp)    ((xfs_btree_block_t *)XFS_BUF_PTR(bp))
-#define        XFS_BUF_TO_LBLOCK(bp)   ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
-#define        XFS_BUF_TO_SBLOCK(bp)   ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
+#define        XFS_BUF_TO_BLOCK(bp)    ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
 
 
-#ifdef __KERNEL__
-
-#ifdef DEBUG
 /*
- * Debug routine: check that block header is ok.
+ * Check that block header is ok.
  */
-void
+int
 xfs_btree_check_block(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_block_t       *block, /* generic btree block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp);   /* buffer containing block, if any */
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
-       xfs_btnum_t             btnum,  /* btree identifier */
-       void                    *ak1,   /* pointer to left (lower) key */
-       void                    *ak2);  /* pointer to right (higher) key */
-
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
-       xfs_btnum_t             btnum,  /* btree identifier */
-       void                    *ar1,   /* pointer to left (lower) record */
-       void                    *ar2);  /* pointer to right (higher) record */
-#else
-#define        xfs_btree_check_block(a,b,c,d)
-#define        xfs_btree_check_key(a,b,c)
-#define        xfs_btree_check_rec(a,b,c)
-#endif /* DEBUG */
-
-/*
- * Checking routine: check that long form block header is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_lblock_t      *block, /* btree long form block pointer */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       struct xfs_btree_block  *block, /* generic btree block pointer */
        int                     level,  /* level of the btree block */
        struct xfs_buf          *bp);   /* buffer containing block, if any */
 
 /*
- * Checking routine: check that (long) pointer is ok.
+ * Check that (long) pointer is ok.
  */
 int                                    /* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lptr(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
+       struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_dfsbno_t            ptr,    /* btree block disk address */
        int                     level); /* btree block level */
 
-#define xfs_btree_check_lptr_disk(cur, ptr, level) \
-       xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
-
-/*
- * Checking routine: check that short form block header is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_btree_sblock_t      *block, /* btree short form block pointer */
-       int                     level,  /* level of the btree block */
-       struct xfs_buf          *bp);   /* buffer containing block */
-
-/*
- * Checking routine: check that (short) pointer is ok.
- */
-int                                    /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agblock_t           ptr,    /* btree block disk address */
-       int                     level); /* btree block level */
-
 /*
  * Delete the btree cursor.
  */
@@ -280,15 +322,6 @@ xfs_btree_dup_cursor(
        xfs_btree_cur_t         *cur,   /* input cursor */
        xfs_btree_cur_t         **ncur);/* output cursor */
 
-/*
- * Change the cursor to point to the first record in the current block
- * at the given level.  Other levels are unaffected.
- */
-int                                    /* success=1, failure=0 */
-xfs_btree_firstrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level); /* level to change */
-
 /*
  * Get a buffer for the block, return it with no data read.
  * Long-form addressing.
@@ -312,20 +345,6 @@ xfs_btree_get_bufs(
        xfs_agblock_t           agbno,  /* allocation group block number */
        uint                    lock);  /* lock flags for get_buf */
 
-/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B).
- */
-xfs_btree_cur_t *                      /* new btree cursor */
-xfs_btree_init_cursor(
-       struct xfs_mount        *mp,    /* file system mount point */
-       struct xfs_trans        *tp,    /* transaction pointer */
-       struct xfs_buf          *agbp,  /* (A only) buffer for agf structure */
-       xfs_agnumber_t          agno,   /* (A only) allocation group number */
-       xfs_btnum_t             btnum,  /* btree identifier */
-       struct xfs_inode        *ip,    /* (B only) inode owning the btree */
-       int                     whichfork); /* (B only) data/attr fork */
-
 /*
  * Check for the cursor referring to the last block at the given level.
  */
@@ -334,15 +353,6 @@ xfs_btree_islastblock(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     level); /* level to check */
 
-/*
- * Change the cursor to point to the last record in the current block
- * at the given level.  Other levels are unaffected.
- */
-int                                    /* success=1, failure=0 */
-xfs_btree_lastrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level); /* level to change */
-
 /*
  * Compute first and last byte offsets for the fields given.
  * Interprets the offsets table, which contains struct field offsets.
@@ -404,39 +414,53 @@ xfs_btree_reada_bufs(
        xfs_extlen_t            count); /* count of filesystem blocks */
 
 /*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
  */
-int                                    /* readahead block count */
-xfs_btree_readahead_core(
+void
+xfs_btree_setbuf(
        xfs_btree_cur_t         *cur,   /* btree cursor */
        int                     lev,    /* level in btree */
-       int                     lr);    /* left/right bits */
+       struct xfs_buf          *bp);   /* new buffer to set */
 
-static inline int                      /* readahead block count */
-xfs_btree_readahead(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     lev,    /* level in btree */
-       int                     lr)     /* left/right bits */
-{
-       if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
-               return 0;
 
-       return xfs_btree_readahead_core(cur, lev, lr);
-}
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_kill_iroot(struct xfs_btree_cur *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
 
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
 
 /*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
+ * Helpers.
  */
-void
-xfs_btree_setbuf(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     lev,    /* level in btree */
-       struct xfs_buf          *bp);   /* new buffer to set */
+static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+{
+       return be16_to_cpu(block->bb_numrecs);
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+               __uint16_t numrecs)
+{
+       block->bb_numrecs = cpu_to_be16(numrecs);
+}
 
-#endif /* __KERNEL__ */
+static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+{
+       return be16_to_cpu(block->bb_level);
+}
 
 
 /*
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644 (file)
index 0000000..44ff942
--- /dev/null
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_types.h"
+#include "xfs_inum.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+
+STATIC void
+xfs_btree_trace_ptr(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     ptr,
+       __psunsigned_t          *high,
+       __psunsigned_t          *low)
+{
+       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+               __u64 val = be64_to_cpu(ptr.l);
+               *high = val >> 32;
+               *low = (int)val;
+       } else {
+               *high = 0;
+               *low = be32_to_cpu(ptr.s);
+       }
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
+ */
+void
+xfs_btree_trace_argbi(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *b,
+       int                     i,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
+                                line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
+                                0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
+ */
+void
+xfs_btree_trace_argbii(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *b,
+       int                     i0,
+       int                     i1,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
+                                line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
+                                0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for 3 block-length args
+ * and an integer arg.
+ */
+void
+xfs_btree_trace_argfffi(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       xfs_dfiloff_t           o,
+       xfs_dfsbno_t            b,
+       xfs_dfilblks_t          i,
+       int                     j,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
+                                line,
+                                o >> 32, (int)o,
+                                b >> 32, (int)b,
+                                i >> 32, (int)i,
+                                (int)j, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for one integer arg.
+ */
+void
+xfs_btree_trace_argi(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       int                     line)
+{
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
+                                line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, key.
+ */
+void
+xfs_btree_trace_argipk(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       union xfs_btree_ptr     ptr,
+       union xfs_btree_key     *key,
+       int                     line)
+{
+       __psunsigned_t          high, low;
+       __uint64_t              l0, l1;
+
+       xfs_btree_trace_ptr(cur, ptr, &high, &low);
+       cur->bc_ops->trace_key(cur, key, &l0, &l1);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
+                                line, i, high, low,
+                                l0 >> 32, (int)l0,
+                                l1 >> 32, (int)l1,
+                                0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, fsblock, rec.
+ */
+void
+xfs_btree_trace_argipr(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       union xfs_btree_ptr     ptr,
+       union xfs_btree_rec     *rec,
+       int                     line)
+{
+       __psunsigned_t          high, low;
+       __uint64_t              l0, l1, l2;
+
+       xfs_btree_trace_ptr(cur, ptr, &high, &low);
+       cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
+                             line, i,
+                             high, low,
+                             l0 >> 32, (int)l0,
+                             l1 >> 32, (int)l1,
+                             l2 >> 32, (int)l2,
+                             0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, key.
+ */
+void
+xfs_btree_trace_argik(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     i,
+       union xfs_btree_key     *key,
+       int                     line)
+{
+       __uint64_t              l0, l1;
+
+       cur->bc_ops->trace_key(cur, key, &l0, &l1);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
+                                line, i,
+                                l0 >> 32, (int)l0,
+                                l1 >> 32, (int)l1,
+                                0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for record.
+ */
+void
+xfs_btree_trace_argr(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       int                     line)
+{
+       __uint64_t              l0, l1, l2;
+
+       cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+       cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
+                             line,
+                             l0 >> 32, (int)l0,
+                             l1 >> 32, (int)l1,
+                             l2 >> 32, (int)l2,
+                             0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for the cursor/operation.
+ */
+void
+xfs_btree_trace_cursor(
+       const char              *func,
+       struct xfs_btree_cur    *cur,
+       int                     type,
+       int                     line)
+{
+       __uint32_t              s0;
+       __uint64_t              l0, l1;
+       char                    *s;
+
+       switch (type) {
+       case XBT_ARGS:
+               s = "args";
+               break;
+       case XBT_ENTRY:
+               s = "entry";
+               break;
+       case XBT_ERROR:
+               s = "error";
+               break;
+       case XBT_EXIT:
+               s = "exit";
+               break;
+       default:
+               s = "unknown";
+               break;
+       }
+
+       cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
+       cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
+                                s0,
+                                l0 >> 32, (int)l0,
+                                l1 >> 32, (int)l1,
+                                (__psunsigned_t)cur->bc_bufs[0],
+                                (__psunsigned_t)cur->bc_bufs[1],
+                                (__psunsigned_t)cur->bc_bufs[2],
+                                (__psunsigned_t)cur->bc_bufs[3],
+                                (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
+                                (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
+}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644 (file)
index 0000000..b3f5eb3
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BTREE_TRACE_H__
+#define        __XFS_BTREE_TRACE_H__
+
+struct xfs_btree_cur;
+struct xfs_buf;
+
+
+/*
+ * Trace hooks.
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ */
+
+#ifdef XFS_BTREE_TRACE
+
+/*
+ * Trace buffer entry types.
+ */
+#define XFS_BTREE_KTRACE_ARGBI   1
+#define XFS_BTREE_KTRACE_ARGBII  2
+#define XFS_BTREE_KTRACE_ARGFFFI 3
+#define XFS_BTREE_KTRACE_ARGI    4
+#define XFS_BTREE_KTRACE_ARGIPK  5
+#define XFS_BTREE_KTRACE_ARGIPR  6
+#define XFS_BTREE_KTRACE_ARGIK   7
+#define XFS_BTREE_KTRACE_ARGR   8
+#define XFS_BTREE_KTRACE_CUR     9
+
+/*
+ * Sub-types for cursor traces.
+ */
+#define XBT_ARGS       0
+#define XBT_ENTRY      1
+#define XBT_ERROR      2
+#define XBT_EXIT       3
+
+void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
+               struct xfs_buf *, int, int);
+void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
+               struct xfs_buf *, int, int, int);
+void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
+               xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
+void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
+void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
+               union xfs_btree_ptr, union xfs_btree_key *, int);
+void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
+               union xfs_btree_ptr, union xfs_btree_rec *, int);
+void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
+               union xfs_btree_key *, int);
+void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
+               union xfs_btree_rec *, int);
+void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
+
+
+#define XFS_ALLOCBT_TRACE_SIZE 4096    /* size of global trace buffer */
+extern ktrace_t        *xfs_allocbt_trace_buf;
+
+#define XFS_INOBT_TRACE_SIZE   4096    /* size of global trace buffer */
+extern ktrace_t        *xfs_inobt_trace_buf;
+
+#define XFS_BMBT_TRACE_SIZE    4096    /* size of global trace buffer */
+#define XFS_BMBT_KTRACE_SIZE   32      /* size of per-inode trace buffer */
+extern ktrace_t        *xfs_bmbt_trace_buf;
+
+
+#define        XFS_BTREE_TRACE_ARGBI(c, b, i)  \
+       xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
+#define        XFS_BTREE_TRACE_ARGBII(c, b, i, j)      \
+       xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
+#define        XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)  \
+       xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
+#define        XFS_BTREE_TRACE_ARGI(c, i)      \
+       xfs_btree_trace_argi(__func__, c, i, __LINE__)
+#define        XFS_BTREE_TRACE_ARGIPK(c, i, p, k)      \
+       xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
+#define        XFS_BTREE_TRACE_ARGIPR(c, i, p, r)      \
+       xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
+#define        XFS_BTREE_TRACE_ARGIK(c, i, k)  \
+       xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
+#define XFS_BTREE_TRACE_ARGR(c, r)     \
+       xfs_btree_trace_argr(__func__, c, r, __LINE__)
+#define        XFS_BTREE_TRACE_CURSOR(c, t)    \
+       xfs_btree_trace_cursor(__func__, c, t, __LINE__)
+#else
+#define        XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define        XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define        XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
+#define        XFS_BTREE_TRACE_ARGI(c, i)
+#define        XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define        XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define        XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define        XFS_BTREE_TRACE_CURSOR(c, t)
+#endif /* XFS_BTREE_TRACE */
+
+#endif /* __XFS_BTREE_TRACE_H__ */
index 002fc2617c8ee9426bff6203eb5bc6039094bf0a..d245d04e10ca284034ca769641c83dd6ff2a467a 100644 (file)
@@ -375,7 +375,7 @@ xfs_buf_item_unpin(
        xfs_buf_log_item_t      *bip,
        int                     stale)
 {
-       xfs_mount_t     *mp;
+       struct xfs_ail  *ailp;
        xfs_buf_t       *bp;
        int             freed;
 
@@ -387,7 +387,7 @@ xfs_buf_item_unpin(
        xfs_buftrace("XFS_UNPIN", bp);
 
        freed = atomic_dec_and_test(&bip->bli_refcount);
-       mp = bip->bli_item.li_mountp;
+       ailp = bip->bli_item.li_ailp;
        xfs_bunpin(bp);
        if (freed && stale) {
                ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -399,17 +399,17 @@ xfs_buf_item_unpin(
                xfs_buftrace("XFS_UNPIN STALE", bp);
                /*
                 * If we get called here because of an IO error, we may
-                * or may not have the item on the AIL. xfs_trans_delete_ail()
+                * or may not have the item on the AIL. xfs_trans_ail_delete()
                 * will take care of that situation.
-                * xfs_trans_delete_ail() drops the AIL lock.
+                * xfs_trans_ail_delete() drops the AIL lock.
                 */
                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
                        xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
                        XFS_BUF_CLR_IODONE_FUNC(bp);
                } else {
-                       spin_lock(&mp->m_ail_lock);
-                       xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
+                       spin_lock(&ailp->xa_lock);
+                       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
                        xfs_buf_item_relse(bp);
                        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
                }
@@ -731,6 +731,7 @@ xfs_buf_item_init(
        bip->bli_item.li_type = XFS_LI_BUF;
        bip->bli_item.li_ops = &xfs_buf_item_ops;
        bip->bli_item.li_mountp = mp;
+       bip->bli_item.li_ailp = mp->m_ail;
        bip->bli_buf = bp;
        xfs_buf_hold(bp);
        bip->bli_format.blf_type = XFS_LI_BUF;
@@ -1122,27 +1123,23 @@ xfs_buf_iodone(
        xfs_buf_t               *bp,
        xfs_buf_log_item_t      *bip)
 {
-       struct xfs_mount        *mp;
+       struct xfs_ail          *ailp = bip->bli_item.li_ailp;
 
        ASSERT(bip->bli_buf == bp);
 
        xfs_buf_rele(bp);
-       mp = bip->bli_item.li_mountp;
 
        /*
         * If we are forcibly shutting down, this may well be
         * off the AIL already. That's because we simulate the
         * log-committed callbacks to unpin these buffers. Or we may never
         * have put this item on AIL because of the transaction was
-        * aborted forcibly. xfs_trans_delete_ail() takes care of these.
+        * aborted forcibly. xfs_trans_ail_delete() takes care of these.
         *
         * Either way, AIL is useless if we're forcing a shutdown.
         */
-       spin_lock(&mp->m_ail_lock);
-       /*
-        * xfs_trans_delete_ail() drops the AIL lock.
-        */
-       xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
+       spin_lock(&ailp->xa_lock);
+       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
        xfs_buf_item_free(bip);
 }
 
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644 (file)
index d2ce5dd..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_CLNT_H__
-#define __XFS_CLNT_H__
-
-/*
- * XFS arguments structure, constructed from the arguments we
- * are passed via the mount system call.
- *
- * NOTE: The mount system call is handled differently between
- * Linux and IRIX.  In IRIX we worked work with a binary data
- * structure coming in across the syscall interface from user
- * space (the mount userspace knows about each filesystem type
- * and the set of valid options for it, and converts the users
- * argument string into a binary structure _before_ making the
- * system call), and the ABI issues that this implies.
- *
- * In Linux, we are passed a comma separated set of options;
- * ie. a NULL terminated string of characters.  Userspace mount
- * code does not have any knowledge of mount options expected by
- * each filesystem type and so each filesystem parses its mount
- * options in kernel space.
- *
- * For the Linux port, we kept this structure pretty much intact
- * and use it internally (because the existing code groks it).
- */
-struct xfs_mount_args {
-       int     flags;          /* flags -> see XFSMNT_... macros below */
-       int     flags2;         /* flags -> see XFSMNT2_... macros below */
-       int     logbufs;        /* Number of log buffers, -1 to default */
-       int     logbufsize;     /* Size of log buffers, -1 to default */
-       char    fsname[MAXNAMELEN+1];   /* data device name */
-       char    rtname[MAXNAMELEN+1];   /* realtime device filename */
-       char    logname[MAXNAMELEN+1];  /* journal device filename */
-       char    mtpt[MAXNAMELEN+1];     /* filesystem mount point */
-       int     sunit;          /* stripe unit (BBs) */
-       int     swidth;         /* stripe width (BBs), multiple of sunit */
-       uchar_t iosizelog;      /* log2 of the preferred I/O size */
-       int     ihashsize;      /* inode hash table size (buckets) */
-};
-
-/*
- * XFS mount option flags -- args->flags1
- */
-#define        XFSMNT_ATTR2            0x00000001      /* allow ATTR2 EA format */
-#define        XFSMNT_WSYNC            0x00000002      /* safe mode nfs mount
-                                                * compatible */
-#define        XFSMNT_INO64            0x00000004      /* move inode numbers up
-                                                * past 2^32 */
-#define XFSMNT_UQUOTA          0x00000008      /* user quota accounting */
-#define XFSMNT_PQUOTA          0x00000010      /* IRIX prj quota accounting */
-#define XFSMNT_UQUOTAENF       0x00000020      /* user quota limit
-                                                * enforcement */
-#define XFSMNT_PQUOTAENF       0x00000040      /* IRIX project quota limit
-                                                * enforcement */
-#define XFSMNT_QUIET           0x00000080      /* don't report mount errors */
-#define XFSMNT_NOALIGN         0x00000200      /* don't allocate at
-                                                * stripe boundaries*/
-#define XFSMNT_RETERR          0x00000400      /* return error to user */
-#define XFSMNT_NORECOVERY      0x00000800      /* no recovery, implies
-                                                * read-only mount */
-#define XFSMNT_SHARED          0x00001000      /* shared XFS mount */
-#define XFSMNT_IOSIZE          0x00002000      /* optimize for I/O size */
-#define XFSMNT_OSYNCISOSYNC    0x00004000      /* o_sync is REALLY o_sync */
-                                               /* (osyncisdsync is default) */
-#define XFSMNT_NOATTR2         0x00008000      /* turn off ATTR2 EA format */
-#define XFSMNT_32BITINODES     0x00200000      /* restrict inodes to 32
-                                                * bits of address space */
-#define XFSMNT_GQUOTA          0x00400000      /* group quota accounting */
-#define XFSMNT_GQUOTAENF       0x00800000      /* group quota limit
-                                                * enforcement */
-#define XFSMNT_NOUUID          0x01000000      /* Ignore fs uuid */
-#define XFSMNT_DMAPI           0x02000000      /* enable dmapi/xdsm */
-#define XFSMNT_BARRIER         0x04000000      /* use write barriers */
-#define XFSMNT_IKEEP           0x08000000      /* inode cluster delete */
-#define XFSMNT_SWALLOC         0x10000000      /* turn on stripe width
-                                                * allocation */
-#define XFSMNT_DIRSYNC         0x40000000      /* sync creat,link,unlink,rename
-                                                * symlink,mkdir,rmdir,mknod */
-#define XFSMNT_FLAGS2          0x80000000      /* more flags set in flags2 */
-
-/*
- * XFS mount option flags -- args->flags2
- */
-#define XFSMNT2_COMPAT_IOSIZE  0x00000001      /* don't report large preferred
-                                                * I/O size in stat(2) */
-#define XFSMNT2_FILESTREAMS    0x00000002      /* enable the filestreams
-                                                * allocator */
-
-#endif /* __XFS_CLNT_H__ */
index 9e561a9cefcaa193c5a4214ad7449246dcd96a44..a11a8390bf6caa7d12270e182efabee7f5fcffac 100644 (file)
@@ -1566,11 +1566,14 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        int nmap, error, w, count, c, got, i, mapi;
        xfs_trans_t *tp;
        xfs_mount_t *mp;
+       xfs_drfsbno_t   nblks;
 
        dp = args->dp;
        mp = dp->i_mount;
        w = args->whichfork;
        tp = args->trans;
+       nblks = dp->i_d.di_nblocks;
+
        /*
         * For new directories adjust the file offset and block count.
         */
@@ -1647,6 +1650,8 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
        }
        if (mapp != &map)
                kmem_free(mapp);
+       /* account for newly allocated blocks in reserved blocks total */
+       args->total -= dp->i_d.di_nblocks - nblks;
        *new_blkno = (xfs_dablk_t)bno;
        return 0;
 }
index 8be0b00ede9aafc83970a23e4ad9c7df2565b54d..70b710c1792d33bfce9c94641b2e62016d48493e 100644 (file)
@@ -72,27 +72,7 @@ typedef struct xfs_da_intnode {
 typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
 typedef struct xfs_da_node_entry xfs_da_node_entry_t;
 
-#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
-
 #define        XFS_LBSIZE(mp)  (mp)->m_sb.sb_blocksize
-#define        XFS_LBLOG(mp)   (mp)->m_sb.sb_blocklog
-
-#define        XFS_DA_MAKE_BNOENTRY(mp,bno,entry)      \
-       (((bno) << (mp)->m_dircook_elog) | (entry))
-#define        XFS_DA_MAKE_COOKIE(mp,bno,entry,hash)   \
-       (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
-#define        XFS_DA_COOKIE_HASH(mp,cookie)           ((xfs_dahash_t)cookie)
-#define        XFS_DA_COOKIE_BNO(mp,cookie)            \
-       ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
-               (xfs_dablk_t)0 : \
-               (xfs_dablk_t)((xfs_off_t)(cookie) >> \
-                               ((mp)->m_dircook_elog + 32))))
-#define        XFS_DA_COOKIE_ENTRY(mp,cookie)          \
-       ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
-               (xfs_dablk_t)0 : \
-               (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
-                               ((1 << (mp)->m_dircook_elog) - 1))))
-
 
 /*========================================================================
  * Btree searching and modification structure definitions.
@@ -226,9 +206,8 @@ struct xfs_nameops {
 };
 
 
-#ifdef __KERNEL__
 /*========================================================================
- * Function prototypes for the kernel.
+ * Function prototypes.
  *========================================================================*/
 
 /*
@@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
 
 extern struct kmem_zone *xfs_da_state_zone;
 extern struct kmem_zone *xfs_dabuf_zone;
-#endif /* __KERNEL__ */
 
 #endif /* __XFS_DA_BTREE_H__ */
index c9065eaf2a4d2a9eceea58fe0563f4377cfe0ad1..d7cf392cc8526ae2b1b79d3b022570f04ae30119 100644 (file)
@@ -78,8 +78,7 @@ typedef struct xfs_dinode
        xfs_dinode_core_t       di_core;
        /*
         * In adding anything between the core and the union, be
-        * sure to update the macros like XFS_LITINO below and
-        * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
+        * sure to update the macros like XFS_LITINO below.
         */
        __be32                  di_next_unlinked;/* agi unlinked list ptr */
        union {
@@ -166,7 +165,7 @@ typedef enum xfs_dinode_fmt
  */
 #define        XFS_LITINO(mp)  ((mp)->m_litino)
 #define        XFS_BROOT_SIZE_ADJ      \
-       (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
+       (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
 
 /*
  * Inode data & attribute fork sizes, per inode.
index 80e0dc51361c07e541726ff75871f399fd58d181..1afb12278b8d04b85eb42d0f67d0bd21936885be 100644 (file)
@@ -525,11 +525,13 @@ xfs_dir2_grow_inode(
        xfs_mount_t     *mp;
        int             nmap;           /* number of bmap entries */
        xfs_trans_t     *tp;
+       xfs_drfsbno_t   nblks;
 
        xfs_dir2_trace_args_s("grow_inode", args, space);
        dp = args->dp;
        tp = args->trans;
        mp = dp->i_mount;
+       nblks = dp->i_d.di_nblocks;
        /*
         * Set lowest possible block in the space requested.
         */
@@ -622,7 +624,11 @@ xfs_dir2_grow_inode(
         */
        if (mapp != &map)
                kmem_free(mapp);
+
+       /* account for newly allocated blocks in reserved blocks total */
+       args->total -= dp->i_d.di_nblocks - nblks;
        *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+
        /*
         * Update file's size if this is the data space and it grew.
         */
index a1e55fb9d5dde43c95565c1aece67d4345232391..e71e2581c0c34b7e3febea7b907d41c5b82c979d 100644 (file)
@@ -25,7 +25,6 @@
 #include "xfs_inum.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_clnt.h"
 
 
 static struct xfs_dmops xfs_dmcore_stub = {
@@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = {
 };
 
 int
-xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args)
+xfs_dmops_get(struct xfs_mount *mp)
 {
-       if (args->flags & XFSMNT_DMAPI) {
+       if (mp->m_flags & XFS_MOUNT_DMAPI) {
                cmn_err(CE_WARN,
                        "XFS: dmapi support not available in this kernel.");
                return EINVAL;
index 8aa28f751b2a6cb54bf60762772079820f74426c..05a4bdd4be39a09db4cec3f18e7d9200838ec6b4 100644 (file)
@@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
 STATIC void
 xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
 {
-       xfs_mount_t     *mp;
+       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
 
-       mp = efip->efi_item.li_mountp;
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&ailp->xa_lock);
        if (efip->efi_flags & XFS_EFI_CANCELED) {
-               /*
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
-               xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
                xfs_efi_item_free(efip);
        } else {
                efip->efi_flags |= XFS_EFI_COMMITTED;
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 }
 
@@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
 STATIC void
 xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
 {
-       xfs_mount_t     *mp;
+       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
        xfs_log_item_desc_t     *lidp;
 
-       mp = efip->efi_item.li_mountp;
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&ailp->xa_lock);
        if (efip->efi_flags & XFS_EFI_CANCELED) {
                /*
                 * free the xaction descriptor pointing to this item
                 */
                lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
                xfs_trans_free_item(tp, lidp);
-               /*
-                * pull the item off the AIL.
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
-               xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
                xfs_efi_item_free(efip);
        } else {
                efip->efi_flags |= XFS_EFI_COMMITTED;
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 }
 
@@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t    *mp,
        efip->efi_item.li_type = XFS_LI_EFI;
        efip->efi_item.li_ops = &xfs_efi_item_ops;
        efip->efi_item.li_mountp = mp;
+       efip->efi_item.li_ailp = mp->m_ail;
        efip->efi_format.efi_nextents = nextents;
        efip->efi_format.efi_id = (__psint_t)(void*)efip;
 
@@ -345,25 +340,22 @@ void
 xfs_efi_release(xfs_efi_log_item_t     *efip,
                uint                    nextents)
 {
-       xfs_mount_t     *mp;
-       int             extents_left;
+       struct xfs_ail          *ailp = efip->efi_item.li_ailp;
+       int                     extents_left;
 
-       mp = efip->efi_item.li_mountp;
        ASSERT(efip->efi_next_extent > 0);
        ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
 
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&ailp->xa_lock);
        ASSERT(efip->efi_next_extent >= nextents);
        efip->efi_next_extent -= nextents;
        extents_left = efip->efi_next_extent;
        if (extents_left == 0) {
-               /*
-                * xfs_trans_delete_ail() drops the AIL lock.
-                */
-               xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
                xfs_efi_item_free(efip);
        } else {
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 }
 
@@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t    *mp,
        efdp->efd_item.li_type = XFS_LI_EFD;
        efdp->efd_item.li_ops = &xfs_efd_item_ops;
        efdp->efd_item.li_mountp = mp;
+       efdp->efd_item.li_ailp = mp->m_ail;
        efdp->efd_efip = efip;
        efdp->efd_format.efd_nextents = nextents;
        efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
index 84583cf73db329ac156986cfc9476f210a48fdda..f1d0585041b9d262401fedaf7fab714c2b623e36 100644 (file)
@@ -126,7 +126,7 @@ xfs_growfs_data_private(
        xfs_extlen_t            agsize;
        xfs_extlen_t            tmpsize;
        xfs_alloc_rec_t         *arec;
-       xfs_btree_sblock_t      *block;
+       struct xfs_btree_block  *block;
        xfs_buf_t               *bp;
        int                     bucket;
        int                     dpct;
@@ -251,14 +251,14 @@ xfs_growfs_data_private(
                bp = xfs_buf_get(mp->m_ddev_targp,
                        XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
                        BTOBB(mp->m_sb.sb_blocksize), 0);
-               block = XFS_BUF_TO_SBLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                memset(block, 0, mp->m_sb.sb_blocksize);
                block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
                block->bb_level = 0;
                block->bb_numrecs = cpu_to_be16(1);
-               block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-               arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1);
+               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+               arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
                arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
@@ -272,14 +272,14 @@ xfs_growfs_data_private(
                bp = xfs_buf_get(mp->m_ddev_targp,
                        XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
                        BTOBB(mp->m_sb.sb_blocksize), 0);
-               block = XFS_BUF_TO_SBLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                memset(block, 0, mp->m_sb.sb_blocksize);
                block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
                block->bb_level = 0;
                block->bb_numrecs = cpu_to_be16(1);
-               block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-               arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1);
+               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+               arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
                arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
@@ -294,13 +294,13 @@ xfs_growfs_data_private(
                bp = xfs_buf_get(mp->m_ddev_targp,
                        XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
                        BTOBB(mp->m_sb.sb_blocksize), 0);
-               block = XFS_BUF_TO_SBLOCK(bp);
+               block = XFS_BUF_TO_BLOCK(bp);
                memset(block, 0, mp->m_sb.sb_blocksize);
                block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
                block->bb_level = 0;
                block->bb_numrecs = 0;
-               block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
                error = xfs_bwrite(mp, bp);
                if (error) {
                        goto error0;
index aad8c5da38afadb158518d7ee5dc51a3309726f2..c8a56c529642f39928d058719b44dd03d454d198 100644 (file)
@@ -118,6 +118,102 @@ xfs_ialloc_cluster_alignment(
        return 1;
 }
 
+/*
+ * Lookup the record equal to ino in the btree given by cur.
+ */
+STATIC int                             /* error */
+xfs_inobt_lookup_eq(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free,   /* free inode mask */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_freecount = fcnt;
+       cur->bc_rec.i.ir_free = free;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int                                    /* error */
+xfs_inobt_lookup_ge(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free,   /* free inode mask */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_freecount = fcnt;
+       cur->bc_rec.i.ir_free = free;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int                                    /* error */
+xfs_inobt_lookup_le(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free,   /* free inode mask */
+       int                     *stat)  /* success/failure */
+{
+       cur->bc_rec.i.ir_startino = ino;
+       cur->bc_rec.i.ir_freecount = fcnt;
+       cur->bc_rec.i.ir_free = free;
+       return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [ino, fcnt, free].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int                             /* error */
+xfs_inobt_update(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             ino,    /* starting inode of chunk */
+       __int32_t               fcnt,   /* free inode count */
+       xfs_inofree_t           free)   /* free inode mask */
+{
+       union xfs_btree_rec     rec;
+
+       rec.inobt.ir_startino = cpu_to_be32(ino);
+       rec.inobt.ir_freecount = cpu_to_be32(fcnt);
+       rec.inobt.ir_free = cpu_to_be64(free);
+       return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int                                    /* error */
+xfs_inobt_get_rec(
+       struct xfs_btree_cur    *cur,   /* btree cursor */
+       xfs_agino_t             *ino,   /* output: starting inode of chunk */
+       __int32_t               *fcnt,  /* output: number of free inodes */
+       xfs_inofree_t           *free,  /* output: free inode mask */
+       int                     *stat)  /* output: success/failure */
+{
+       union xfs_btree_rec     *rec;
+       int                     error;
+
+       error = xfs_btree_get_rec(cur, &rec, stat);
+       if (!error && *stat == 1) {
+               *ino = be32_to_cpu(rec->inobt.ir_startino);
+               *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
+               *free = be64_to_cpu(rec->inobt.ir_free);
+       }
+       return error;
+}
+
 /*
  * Allocate new inodes in the allocation group specified by agbp.
  * Return 0 for success, else error code.
@@ -335,8 +431,7 @@ xfs_ialloc_ag_alloc(
        /*
         * Insert records describing the new inode chunk into the btree.
         */
-       cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno,
-                       XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+       cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
        for (thisino = newino;
             thisino < newino + newlen;
             thisino += XFS_INODES_PER_CHUNK) {
@@ -346,7 +441,7 @@ xfs_ialloc_ag_alloc(
                        return error;
                }
                ASSERT(i == 0);
-               if ((error = xfs_inobt_insert(cur, &i))) {
+               if ((error = xfs_btree_insert(cur, &i))) {
                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
                        return error;
                }
@@ -676,8 +771,7 @@ nextag:
         */
        agno = tagno;
        *IO_agbp = NULL;
-       cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno),
-                                   XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
        /*
         * If pagino is 0 (this is the root inode allocation) use newino.
         * This must work because we've just allocated some.
@@ -697,7 +791,7 @@ nextag:
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                        freecount += rec.ir_freecount;
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto error0;
                } while (i == 1);
 
@@ -741,7 +835,7 @@ nextag:
                        /*
                         * Search left with tcur, back up 1 record.
                         */
-                       if ((error = xfs_inobt_decrement(tcur, 0, &i)))
+                       if ((error = xfs_btree_decrement(tcur, 0, &i)))
                                goto error1;
                        doneleft = !i;
                        if (!doneleft) {
@@ -755,7 +849,7 @@ nextag:
                        /*
                         * Search right with cur, go forward 1 record.
                         */
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto error1;
                        doneright = !i;
                        if (!doneright) {
@@ -817,7 +911,7 @@ nextag:
                                 * further left.
                                 */
                                if (useleft) {
-                                       if ((error = xfs_inobt_decrement(tcur, 0,
+                                       if ((error = xfs_btree_decrement(tcur, 0,
                                                        &i)))
                                                goto error1;
                                        doneleft = !i;
@@ -837,7 +931,7 @@ nextag:
                                 * further right.
                                 */
                                else {
-                                       if ((error = xfs_inobt_increment(cur, 0,
+                                       if ((error = xfs_btree_increment(cur, 0,
                                                        &i)))
                                                goto error1;
                                        doneright = !i;
@@ -892,7 +986,7 @@ nextag:
                                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                                if (rec.ir_freecount > 0)
                                        break;
-                               if ((error = xfs_inobt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto error0;
                                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                        }
@@ -926,7 +1020,7 @@ nextag:
                                goto error0;
                        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
                        freecount += rec.ir_freecount;
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
+                       if ((error = xfs_btree_increment(cur, 0, &i)))
                                goto error0;
                } while (i == 1);
                ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
@@ -1022,8 +1116,7 @@ xfs_difree(
        /*
         * Initialize the cursor.
         */
-       cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
-               (xfs_inode_t *)0, 0);
+       cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
 #ifdef DEBUG
        if (cur->bc_nlevels == 1) {
                int freecount = 0;
@@ -1036,7 +1129,7 @@ xfs_difree(
                                goto error0;
                        if (i) {
                                freecount += rec.ir_freecount;
-                               if ((error = xfs_inobt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto error0;
                        }
                } while (i == 1);
@@ -1098,8 +1191,8 @@ xfs_difree(
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
 
-               if ((error = xfs_inobt_delete(cur, &i))) {
-                       cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n",
+               if ((error = xfs_btree_delete(cur, &i))) {
+                       cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
                                error, mp->m_fsname);
                        goto error0;
                }
@@ -1141,7 +1234,7 @@ xfs_difree(
                                goto error0;
                        if (i) {
                                freecount += rec.ir_freecount;
-                               if ((error = xfs_inobt_increment(cur, 0, &i)))
+                               if ((error = xfs_btree_increment(cur, 0, &i)))
                                        goto error0;
                        }
                } while (i == 1);
@@ -1259,8 +1352,7 @@ xfs_dilocate(
 #endif /* DEBUG */
                        return error;
                }
-               cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
-                       (xfs_inode_t *)0, 0);
+               cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
                if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
 #ifdef DEBUG
                        xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
index 4e30ec1d13bc35b3fd3af4b0b0ad50bde967ad58..ccf554a6e0a1dcada20008141d6090dff8cbec59 100644 (file)
@@ -56,7 +56,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
 }
 
 
-#ifdef __KERNEL__
 /*
  * Allocate an inode on disk.
  * Mode is used to tell whether the new inode will need space, and whether
@@ -154,6 +153,24 @@ xfs_ialloc_pagi_init(
        struct xfs_trans *tp,           /* transaction pointer */
         xfs_agnumber_t  agno);         /* allocation group number */
 
-#endif /* __KERNEL__ */
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
+               __int32_t fcnt, xfs_inofree_t free, int *stat);
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
+               __int32_t fcnt, xfs_inofree_t free, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
+                            __int32_t *fcnt, xfs_inofree_t *free, int *stat);
 
 #endif /* __XFS_IALLOC_H__ */
index 83502f3edef0d99477e6fb539418950ec9019c9b..99f2408e8d8e634ac446914a505982bba3d48deb 100644 (file)
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
 
-STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
-               xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
 
-/*
- * Single level of the xfs_inobt_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int                             /* error */
-xfs_inobt_delrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level removing record from */
-       int                     *stat)  /* fail/done/go-on */
+STATIC int
+xfs_inobt_get_minrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
 {
-       xfs_buf_t               *agbp;  /* buffer for a.g. inode header */
-       xfs_mount_t             *mp;    /* mount structure */
-       xfs_agi_t               *agi;   /* allocation group inode header */
-       xfs_inobt_block_t       *block; /* btree block record/key lives in */
-       xfs_agblock_t           bno;    /* btree block number */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_inobt_key_t         key;    /* kp points here if block is level 0 */
-       xfs_inobt_key_t         *kp = NULL;     /* pointer to btree keys */
-       xfs_agblock_t           lbno;   /* left block's block number */
-       xfs_buf_t               *lbp;   /* left block's buffer pointer */
-       xfs_inobt_block_t       *left;  /* left btree block */
-       xfs_inobt_key_t         *lkp;   /* left block key pointer */
-       xfs_inobt_ptr_t         *lpp;   /* left block address pointer */
-       int                     lrecs = 0;      /* number of records in left block */
-       xfs_inobt_rec_t         *lrp;   /* left block record pointer */
-       xfs_inobt_ptr_t         *pp = NULL;     /* pointer to btree addresses */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_agblock_t           rbno;   /* right block's block number */
-       xfs_buf_t               *rbp;   /* right block's buffer pointer */
-       xfs_inobt_block_t       *right; /* right btree block */
-       xfs_inobt_key_t         *rkp;   /* right block key pointer */
-       xfs_inobt_rec_t         *rp;    /* pointer to btree records */
-       xfs_inobt_ptr_t         *rpp;   /* right block address pointer */
-       int                     rrecs = 0;      /* number of records in right block */
-       int                     numrecs;
-       xfs_inobt_rec_t         *rrp;   /* right block record pointer */
-       xfs_btree_cur_t         *tcur;  /* temporary btree cursor */
-
-       mp = cur->bc_mp;
-
-       /*
-        * Get the index of the entry being deleted, check for nothing there.
-        */
-       ptr = cur->bc_ptrs[level];
-       if (ptr == 0) {
-               *stat = 0;
-               return 0;
-       }
-
-       /*
-        * Get the buffer & block containing the record or key/ptr.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-#endif
-       /*
-        * Fail if we're off the end of the block.
-        */
+       return cur->bc_mp->m_inobt_mnr[level != 0];
+}
 
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (ptr > numrecs) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * It's a nonleaf.  Excise the key and ptr being deleted, by
-        * sliding the entries past them down one.
-        * Log the changed areas of the block.
-        */
-       if (level > 0) {
-               kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
-               pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = ptr; i < numrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
-                               return error;
-               }
-#endif
-               if (ptr < numrecs) {
-                       memmove(&kp[ptr - 1], &kp[ptr],
-                               (numrecs - ptr) * sizeof(*kp));
-                       memmove(&pp[ptr - 1], &pp[ptr],
-                               (numrecs - ptr) * sizeof(*kp));
-                       xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
-                       xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
-               }
-       }
-       /*
-        * It's a leaf.  Excise the record being deleted, by sliding the
-        * entries past it down one.  Log the changed areas of the block.
-        */
-       else {
-               rp = XFS_INOBT_REC_ADDR(block, 1, cur);
-               if (ptr < numrecs) {
-                       memmove(&rp[ptr - 1], &rp[ptr],
-                               (numrecs - ptr) * sizeof(*rp));
-                       xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
-               }
-               /*
-                * If it's the first record in the block, we'll need a key
-                * structure to pass up to the next level (updkey).
-                */
-               if (ptr == 1) {
-                       key.ir_startino = rp->ir_startino;
-                       kp = &key;
-               }
-       }
-       /*
-        * Decrement and log the number of entries in the block.
-        */
-       numrecs--;
-       block->bb_numrecs = cpu_to_be16(numrecs);
-       xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-       /*
-        * Is this the root level?  If so, we're almost done.
-        */
-       if (level == cur->bc_nlevels - 1) {
-               /*
-                * If this is the root level,
-                * and there's only one entry left,
-                * and it's NOT the leaf level,
-                * then we can get rid of this level.
-                */
-               if (numrecs == 1 && level > 0) {
-                       agbp = cur->bc_private.a.agbp;
-                       agi = XFS_BUF_TO_AGI(agbp);
-                       /*
-                        * pp is still set to the first pointer in the block.
-                        * Make it the new root of the btree.
-                        */
-                       bno = be32_to_cpu(agi->agi_root);
-                       agi->agi_root = *pp;
-                       be32_add_cpu(&agi->agi_level, -1);
-                       /*
-                        * Free the block.
-                        */
-                       if ((error = xfs_free_extent(cur->bc_tp,
-                               XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
-                               return error;
-                       xfs_trans_binval(cur->bc_tp, bp);
-                       xfs_ialloc_log_agi(cur->bc_tp, agbp,
-                               XFS_AGI_ROOT | XFS_AGI_LEVEL);
-                       /*
-                        * Update the cursor so there's one fewer level.
-                        */
-                       cur->bc_bufs[level] = NULL;
-                       cur->bc_nlevels--;
-               } else if (level > 0 &&
-                          (error = xfs_inobt_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we deleted the leftmost entry in the block, update the
-        * key values above us in the tree.
-        */
-       if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
-               return error;
-       /*
-        * If the number of records remaining in the block is at least
-        * the minimum, we're done.
-        */
-       if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
-               if (level > 0 &&
-                   (error = xfs_inobt_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Otherwise, we have to move some records around to keep the
-        * tree balanced.  Look at the left and right sibling blocks to
-        * see if we can re-balance by moving only one record.
-        */
-       rbno = be32_to_cpu(block->bb_rightsib);
-       lbno = be32_to_cpu(block->bb_leftsib);
-       bno = NULLAGBLOCK;
-       ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
-       /*
-        * Duplicate the cursor so our btree manipulations here won't
-        * disrupt the next level up.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       /*
-        * If there's a right sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (rbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the last entry in the next block.
-                * Actually any entry but the first would suffice.
-                */
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               if ((error = xfs_inobt_increment(tcur, level, &i)))
-                       goto error0;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               i = xfs_btree_lastrec(tcur, level);
-               XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-               /*
-                * Grab a pointer to the block.
-                */
-               rbp = tcur->bc_bufs[level];
-               right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(right->bb_leftsib);
-               /*
-                * If right block is full enough so that removing one entry
-                * won't make it too empty, and left-shifting an entry out
-                * of right to us works, we're done.
-                */
-               if (be16_to_cpu(right->bb_numrecs) - 1 >=
-                    XFS_INOBT_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_inobt_lshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_INOBT_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level > 0 &&
-                                   (error = xfs_inobt_decrement(cur, level,
-                                               &i)))
-                                       return error;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference, and fix up the temp cursor to point
-                * to our block again (last record).
-                */
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if (lbno != NULLAGBLOCK) {
-                       xfs_btree_firstrec(tcur, level);
-                       if ((error = xfs_inobt_decrement(tcur, level, &i)))
-                               goto error0;
-               }
-       }
-       /*
-        * If there's a left sibling, see if it's ok to shift an entry
-        * out of it.
-        */
-       if (lbno != NULLAGBLOCK) {
-               /*
-                * Move the temp cursor to the first entry in the
-                * previous block.
-                */
-               xfs_btree_firstrec(tcur, level);
-               if ((error = xfs_inobt_decrement(tcur, level, &i)))
-                       goto error0;
-               xfs_btree_firstrec(tcur, level);
-               /*
-                * Grab a pointer to the block.
-                */
-               lbp = tcur->bc_bufs[level];
-               left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       goto error0;
-#endif
-               /*
-                * Grab the current block number, for future use.
-                */
-               bno = be32_to_cpu(left->bb_rightsib);
-               /*
-                * If left block is full enough so that removing one entry
-                * won't make it too empty, and right-shifting an entry out
-                * of left to us works, we're done.
-                */
-               if (be16_to_cpu(left->bb_numrecs) - 1 >=
-                    XFS_INOBT_BLOCK_MINRECS(level, cur)) {
-                       if ((error = xfs_inobt_rshift(tcur, level, &i)))
-                               goto error0;
-                       if (i) {
-                               ASSERT(be16_to_cpu(block->bb_numrecs) >=
-                                      XFS_INOBT_BLOCK_MINRECS(level, cur));
-                               xfs_btree_del_cursor(tcur,
-                                                    XFS_BTREE_NOERROR);
-                               if (level == 0)
-                                       cur->bc_ptrs[0]++;
-                               *stat = 1;
-                               return 0;
-                       }
-               }
-               /*
-                * Otherwise, grab the number of records in right for
-                * future reference.
-                */
-               lrecs = be16_to_cpu(left->bb_numrecs);
-       }
-       /*
-        * Delete the temp cursor, we're done with it.
-        */
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       /*
-        * If here, we need to do a join to keep the tree balanced.
-        */
-       ASSERT(bno != NULLAGBLOCK);
-       /*
-        * See if we can join with the left neighbor block.
-        */
-       if (lbno != NULLAGBLOCK &&
-           lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "right" to be the starting block,
-                * "left" to be the left neighbor.
-                */
-               rbno = bno;
-               right = block;
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               rbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, lbno, 0, &lbp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-                       return error;
-       }
-       /*
-        * If that won't work, see if we can join with the right neighbor block.
-        */
-       else if (rbno != NULLAGBLOCK &&
-                rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * Set "left" to be the starting block,
-                * "right" to be the right neighbor.
-                */
-               lbno = bno;
-               left = block;
-               lrecs = be16_to_cpu(left->bb_numrecs);
-               lbp = bp;
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, rbno, 0, &rbp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-               rrecs = be16_to_cpu(right->bb_numrecs);
-               if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-                       return error;
-       }
-       /*
-        * Otherwise, we can't fix the imbalance.
-        * Just return.  This is probably a logic error, but it's not fatal.
-        */
-       else {
-               if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
-                       return error;
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * We're now going to join "left" and "right" by moving all the stuff
-        * in "right" to "left" and deleting "right".
-        */
-       if (level > 0) {
-               /*
-                * It's a non-leaf.  Move keys and pointers.
-                */
-               lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
-               lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < rrecs; i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(lkp, rkp, rrecs * sizeof(*lkp));
-               memcpy(lpp, rpp, rrecs * sizeof(*lpp));
-               xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
-               xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       } else {
-               /*
-                * It's a leaf.  Move records.
-                */
-               lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               memcpy(lrp, rrp, rrecs * sizeof(*lrp));
-               xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
-       }
-       /*
-        * If we joined with the left neighbor, set the buffer in the
-        * cursor to the left block, and fix up the index.
-        */
-       if (bp != lbp) {
-               xfs_btree_setbuf(cur, level, lbp);
-               cur->bc_ptrs[level] += lrecs;
-       }
-       /*
-        * If we joined with the right neighbor and there's a level above
-        * us, increment the cursor at that level.
-        */
-       else if (level + 1 < cur->bc_nlevels &&
-                (error = xfs_alloc_increment(cur, level + 1, &i)))
-               return error;
-       /*
-        * Fix up the number of records in the surviving block.
-        */
-       lrecs += rrecs;
-       left->bb_numrecs = cpu_to_be16(lrecs);
-       /*
-        * Fix up the right block pointer in the surviving block, and log it.
-        */
-       left->bb_rightsib = right->bb_rightsib;
-       xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there is a right sibling now, make it point to the
-        * remaining block.
-        */
-       if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
-               xfs_inobt_block_t       *rrblock;
-               xfs_buf_t               *rrbp;
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+       struct xfs_btree_cur    *cur)
+{
+       return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+                       cur->bc_private.a.agbp, cur->bc_private.a.agno);
+}
 
-               if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                               cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
-                               &rrbp, XFS_INO_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(lbno);
-               xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * Free the deleting block.
-        */
-       if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
-                                    cur->bc_private.a.agno, rbno), 1)))
-               return error;
-       xfs_trans_binval(cur->bc_tp, rbp);
-       /*
-        * Readjust the ptr at this level if it's not a leaf, since it's
-        * still pointing at the deletion point, which makes the cursor
-        * inconsistent.  If this makes the ptr 0, the caller fixes it up.
-        * We can't use decrement because it would change the next level up.
-        */
-       if (level > 0)
-               cur->bc_ptrs[level]--;
-       /*
-        * Return value means the next level up has something to do.
-        */
-       *stat = 2;
-       return 0;
+STATIC void
+xfs_inobt_set_root(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *nptr,
+       int                     inc)    /* level change */
+{
+       struct xfs_buf          *agbp = cur->bc_private.a.agbp;
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
 
-error0:
-       xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-       return error;
+       agi->agi_root = nptr->s;
+       be32_add_cpu(&agi->agi_level, inc);
+       xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
 }
 
-/*
- * Insert one record/level.  Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int                             /* error */
-xfs_inobt_insrec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to insert record at */
-       xfs_agblock_t           *bnop,  /* i/o: block number inserted */
-       xfs_inobt_rec_t         *recp,  /* i/o: record data inserted */
-       xfs_btree_cur_t         **curp, /* output: new cursor replacing cur */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_inobt_alloc_block(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *start,
+       union xfs_btree_ptr     *new,
+       int                     length,
+       int                     *stat)
 {
-       xfs_inobt_block_t       *block; /* btree block record/key lives in */
-       xfs_buf_t               *bp;    /* buffer for block */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_inobt_key_t         key;    /* key value being inserted */
-       xfs_inobt_key_t         *kp=NULL;       /* pointer to btree keys */
-       xfs_agblock_t           nbno;   /* block number of allocated block */
-       xfs_btree_cur_t         *ncur;  /* new cursor to be used at next lvl */
-       xfs_inobt_key_t         nkey;   /* new key value, from split */
-       xfs_inobt_rec_t         nrec;   /* new record value, for caller */
-       int                     numrecs;
-       int                     optr;   /* old ptr value */
-       xfs_inobt_ptr_t         *pp;    /* pointer to btree addresses */
-       int                     ptr;    /* index in btree block for this rec */
-       xfs_inobt_rec_t         *rp=NULL;       /* pointer to btree records */
+       xfs_alloc_arg_t         args;           /* block allocation args */
+       int                     error;          /* error return value */
+       xfs_agblock_t           sbno = be32_to_cpu(start->s);
 
-       /*
-        * GCC doesn't understand the (arguably complex) control flow in
-        * this function and complains about uninitialized structure fields
-        * without this.
-        */
-       memset(&nrec, 0, sizeof(nrec));
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
 
-       /*
-        * If we made it to the root level, allocate a new root block
-        * and we're done.
-        */
-       if (level >= cur->bc_nlevels) {
-               error = xfs_inobt_newroot(cur, &i);
-               *bnop = NULLAGBLOCK;
-               *stat = i;
+       memset(&args, 0, sizeof(args));
+       args.tp = cur->bc_tp;
+       args.mp = cur->bc_mp;
+       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+       args.minlen = 1;
+       args.maxlen = 1;
+       args.prod = 1;
+       args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+       error = xfs_alloc_vextent(&args);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
        }
-       /*
-        * Make a key out of the record data to be inserted, and save it.
-        */
-       key.ir_startino = recp->ir_startino;
-       optr = ptr = cur->bc_ptrs[level];
-       /*
-        * If we're off the left edge, return failure.
-        */
-       if (ptr == 0) {
+       if (args.fsbno == NULLFSBLOCK) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
                *stat = 0;
                return 0;
        }
-       /*
-        * Get pointers to the btree buffer and block.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-       /*
-        * Check that the new entry is being inserted in the right place.
-        */
-       if (ptr <= numrecs) {
-               if (level == 0) {
-                       rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
-                       xfs_btree_check_rec(cur->bc_btnum, recp, rp);
-               } else {
-                       kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
-                       xfs_btree_check_key(cur->bc_btnum, &key, kp);
-               }
-       }
-#endif
-       nbno = NULLAGBLOCK;
-       ncur = NULL;
-       /*
-        * If the block is full, we can't insert the new entry until we
-        * make the block un-full.
-        */
-       if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               /*
-                * First, try shifting an entry to the right neighbor.
-                */
-               if ((error = xfs_inobt_rshift(cur, level, &i)))
-                       return error;
-               if (i) {
-                       /* nothing */
-               }
-               /*
-                * Next, try shifting an entry to the left neighbor.
-                */
-               else {
-                       if ((error = xfs_inobt_lshift(cur, level, &i)))
-                               return error;
-                       if (i) {
-                               optr = ptr = cur->bc_ptrs[level];
-                       } else {
-                               /*
-                                * Next, try splitting the current block
-                                * in half. If this works we have to
-                                * re-set our variables because
-                                * we could be in a different block now.
-                                */
-                               if ((error = xfs_inobt_split(cur, level, &nbno,
-                                               &nkey, &ncur, &i)))
-                                       return error;
-                               if (i) {
-                                       bp = cur->bc_bufs[level];
-                                       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-                                       if ((error = xfs_btree_check_sblock(cur,
-                                                       block, level, bp)))
-                                               return error;
-#endif
-                                       ptr = cur->bc_ptrs[level];
-                                       nrec.ir_startino = nkey.ir_startino;
-                               } else {
-                                       /*
-                                        * Otherwise the insert fails.
-                                        */
-                                       *stat = 0;
-                                       return 0;
-                               }
-                       }
-               }
-       }
-       /*
-        * At this point we know there's room for our new entry in the block
-        * we're pointing at.
-        */
-       numrecs = be16_to_cpu(block->bb_numrecs);
-       if (level > 0) {
-               /*
-                * It's a non-leaf entry.  Make a hole for the new data
-                * in the key and ptr regions of the block.
-                */
-               kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
-               pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
-               for (i = numrecs; i >= ptr; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
-                               return error;
-               }
-#endif
-               memmove(&kp[ptr], &kp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*kp));
-               memmove(&pp[ptr], &pp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*pp));
-               /*
-                * Now stuff the new data in, bump numrecs and log the new data.
-                */
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
-                       return error;
-#endif
-               kp[ptr - 1] = key;
-               pp[ptr - 1] = cpu_to_be32(*bnop);
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_inobt_log_keys(cur, bp, ptr, numrecs);
-               xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
-       } else {
-               /*
-                * It's a leaf entry.  Make a hole for the new record.
-                */
-               rp = XFS_INOBT_REC_ADDR(block, 1, cur);
-               memmove(&rp[ptr], &rp[ptr - 1],
-                       (numrecs - ptr + 1) * sizeof(*rp));
-               /*
-                * Now stuff the new record in, bump numrecs
-                * and log the new data.
-                */
-               rp[ptr - 1] = *recp;
-               numrecs++;
-               block->bb_numrecs = cpu_to_be16(numrecs);
-               xfs_inobt_log_recs(cur, bp, ptr, numrecs);
-       }
-       /*
-        * Log the new number of records in the btree header.
-        */
-       xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       /*
-        * Check that the key/record is in the right place, now.
-        */
-       if (ptr < numrecs) {
-               if (level == 0)
-                       xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
-                               rp + ptr);
-               else
-                       xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
-                               kp + ptr);
-       }
-#endif
-       /*
-        * If we inserted at the start of a block, update the parents' keys.
-        */
-       if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
-               return error;
-       /*
-        * Return the new block number, if any.
-        * If there is one, give back a record value and a cursor too.
-        */
-       *bnop = nbno;
-       if (nbno != NULLAGBLOCK) {
-               *recp = nrec;
-               *curp = ncur;
-       }
+       ASSERT(args.len == 1);
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+       new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
        *stat = 1;
        return 0;
 }
 
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_inobt_log_block(
-       xfs_trans_t             *tp,    /* transaction pointer */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     fields) /* mask of fields: XFS_BB_... */
+STATIC int
+xfs_inobt_free_block(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp)
 {
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       static const short      offsets[] = {   /* table of offsets */
-               offsetof(xfs_inobt_block_t, bb_magic),
-               offsetof(xfs_inobt_block_t, bb_level),
-               offsetof(xfs_inobt_block_t, bb_numrecs),
-               offsetof(xfs_inobt_block_t, bb_leftsib),
-               offsetof(xfs_inobt_block_t, bb_rightsib),
-               sizeof(xfs_inobt_block_t)
-       };
+       xfs_fsblock_t           fsbno;
+       int                     error;
 
-       xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
-       xfs_trans_log_buf(tp, bp, first, last);
+       fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+       error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+       if (error)
+               return error;
+
+       xfs_trans_binval(cur->bc_tp, bp);
+       return error;
 }
 
-/*
- * Log keys from a btree block (nonleaf).
- */
-STATIC void
-xfs_inobt_log_keys(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     kfirst, /* index of first key to log */
-       int                     klast)  /* index of last key to log */
+STATIC int
+xfs_inobt_get_maxrecs(
+       struct xfs_btree_cur    *cur,
+       int                     level)
 {
-       xfs_inobt_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       xfs_inobt_key_t         *kp;    /* key pointer in btree block */
-       int                     last;   /* last byte offset logged */
-
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+       return cur->bc_mp->m_inobt_mxr[level != 0];
 }
 
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
 STATIC void
-xfs_inobt_log_ptrs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     pfirst, /* index of first pointer to log */
-       int                     plast)  /* index of last pointer to log */
+xfs_inobt_init_key_from_rec(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
 {
-       xfs_inobt_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_inobt_ptr_t         *pp;    /* block-pointer pointer in btree blk */
-
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+       key->inobt.ir_startino = rec->inobt.ir_startino;
 }
 
-/*
- * Log records from a btree block (leaf).
- */
 STATIC void
-xfs_inobt_log_recs(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_buf_t               *bp,    /* buffer containing btree block */
-       int                     rfirst, /* index of first record to log */
-       int                     rlast)  /* index of last record to log */
+xfs_inobt_init_rec_from_key(
+       union xfs_btree_key     *key,
+       union xfs_btree_rec     *rec)
 {
-       xfs_inobt_block_t       *block; /* btree block to log from */
-       int                     first;  /* first byte offset logged */
-       int                     last;   /* last byte offset logged */
-       xfs_inobt_rec_t         *rp;    /* record pointer for btree block */
+       rec->inobt.ir_startino = key->inobt.ir_startino;
+}
 
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-       rp = XFS_INOBT_REC_ADDR(block, 1, cur);
-       first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
-       last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
-       xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+STATIC void
+xfs_inobt_init_rec_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec)
+{
+       rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+       rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+       rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
 }
 
 /*
- * Lookup the record.  The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
+ * intial value of ptr for lookup
  */
-STATIC int                             /* error */
-xfs_inobt_lookup(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_lookup_t            dir,    /* <=, ==, or >= */
-       int                     *stat)  /* success/failure */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_ptr     *ptr)
 {
-       xfs_agblock_t           agbno;  /* a.g. relative btree block number */
-       xfs_agnumber_t          agno;   /* allocation group number */
-       xfs_inobt_block_t       *block=NULL;    /* current btree block */
-       __int64_t               diff;   /* difference for the current key */
-       int                     error;  /* error return value */
-       int                     keyno=0;        /* current key number */
-       int                     level;  /* level in the btree */
-       xfs_mount_t             *mp;    /* file system mount point */
-
-       /*
-        * Get the allocation group header, and the root block number.
-        */
-       mp = cur->bc_mp;
-       {
-               xfs_agi_t       *agi;   /* a.g. inode header */
-
-               agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-               agno = be32_to_cpu(agi->agi_seqno);
-               agbno = be32_to_cpu(agi->agi_root);
-       }
-       /*
-        * Iterate over each level in the btree, starting at the root.
-        * For each level above the leaves, find the key we need, based
-        * on the lookup record, then follow the corresponding block
-        * pointer down to the next level.
-        */
-       for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
-               xfs_buf_t       *bp;    /* buffer pointer for btree block */
-               xfs_daddr_t     d;      /* disk address of btree block */
-
-               /*
-                * Get the disk address we're looking for.
-                */
-               d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-               /*
-                * If the old buffer at this level is for a different block,
-                * throw it away, otherwise just use it.
-                */
-               bp = cur->bc_bufs[level];
-               if (bp && XFS_BUF_ADDR(bp) != d)
-                       bp = NULL;
-               if (!bp) {
-                       /*
-                        * Need to get a new buffer.  Read it, then
-                        * set it in the cursor, releasing the old one.
-                        */
-                       if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
-                                       agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
-                               return error;
-                       xfs_btree_setbuf(cur, level, bp);
-                       /*
-                        * Point to the btree block, now that we have the buffer
-                        */
-                       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-                       if ((error = xfs_btree_check_sblock(cur, block, level,
-                                       bp)))
-                               return error;
-               } else
-                       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-               /*
-                * If we already had a key match at a higher level, we know
-                * we need to use the first entry in this block.
-                */
-               if (diff == 0)
-                       keyno = 1;
-               /*
-                * Otherwise we need to search this block.  Do a binary search.
-                */
-               else {
-                       int             high;   /* high entry number */
-                       xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
-                       xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
-                       int             low;    /* low entry number */
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
 
-                       /*
-                        * Get a pointer to keys or records.
-                        */
-                       if (level > 0)
-                               kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
-                       else
-                               krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
-                       /*
-                        * Set low and high entry numbers, 1-based.
-                        */
-                       low = 1;
-                       if (!(high = be16_to_cpu(block->bb_numrecs))) {
-                               /*
-                                * If the block is empty, the tree must
-                                * be an empty leaf.
-                                */
-                               ASSERT(level == 0 && cur->bc_nlevels == 1);
-                               cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
-                               *stat = 0;
-                               return 0;
-                       }
-                       /*
-                        * Binary search the block.
-                        */
-                       while (low <= high) {
-                               xfs_agino_t     startino;       /* key value */
-
-                               /*
-                                * keyno is average of low and high.
-                                */
-                               keyno = (low + high) >> 1;
-                               /*
-                                * Get startino.
-                                */
-                               if (level > 0) {
-                                       xfs_inobt_key_t *kkp;
-
-                                       kkp = kkbase + keyno - 1;
-                                       startino = be32_to_cpu(kkp->ir_startino);
-                               } else {
-                                       xfs_inobt_rec_t *krp;
-
-                                       krp = krbase + keyno - 1;
-                                       startino = be32_to_cpu(krp->ir_startino);
-                               }
-                               /*
-                                * Compute difference to get next direction.
-                                */
-                               diff = (__int64_t)
-                                       startino - cur->bc_rec.i.ir_startino;
-                               /*
-                                * Less than, move right.
-                                */
-                               if (diff < 0)
-                                       low = keyno + 1;
-                               /*
-                                * Greater than, move left.
-                                */
-                               else if (diff > 0)
-                                       high = keyno - 1;
-                               /*
-                                * Equal, we're done.
-                                */
-                               else
-                                       break;
-                       }
-               }
-               /*
-                * If there are more levels, set up for the next level
-                * by getting the block number and filling in the cursor.
-                */
-               if (level > 0) {
-                       /*
-                        * If we moved left, need the previous key number,
-                        * unless there isn't one.
-                        */
-                       if (diff > 0 && --keyno < 1)
-                               keyno = 1;
-                       agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
-                       if ((error = xfs_btree_check_sptr(cur, agbno, level)))
-                               return error;
-#endif
-                       cur->bc_ptrs[level] = keyno;
-               }
-       }
-       /*
-        * Done with the search.
-        * See if we need to adjust the results.
-        */
-       if (dir != XFS_LOOKUP_LE && diff < 0) {
-               keyno++;
-               /*
-                * If ge search and we went off the end of the block, but it's
-                * not the last block, we're in the wrong block.
-                */
-               if (dir == XFS_LOOKUP_GE &&
-                   keyno > be16_to_cpu(block->bb_numrecs) &&
-                   be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
-                       int     i;
+       ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
 
-                       cur->bc_ptrs[0] = keyno;
-                       if ((error = xfs_inobt_increment(cur, 0, &i)))
-                               return error;
-                       ASSERT(i == 1);
-                       *stat = 1;
-                       return 0;
-               }
-       }
-       else if (dir == XFS_LOOKUP_LE && diff > 0)
-               keyno--;
-       cur->bc_ptrs[0] = keyno;
-       /*
-        * Return if we succeeded or not.
-        */
-       if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
-               *stat = 0;
-       else
-               *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
-       return 0;
+       ptr->s = agi->agi_root;
 }
 
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_inobt_lshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
+STATIC __int64_t
+xfs_inobt_key_diff(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key)
 {
-       int                     error;  /* error return value */
-#ifdef DEBUG
-       int                     i;      /* loop index */
-#endif
-       xfs_inobt_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left neighbor block */
-       xfs_inobt_block_t       *left;  /* left neighbor btree block */
-       xfs_inobt_key_t         *lkp=NULL;      /* key pointer for left block */
-       xfs_inobt_ptr_t         *lpp;   /* address pointer for left block */
-       xfs_inobt_rec_t         *lrp=NULL;      /* record pointer for left block */
-       int                     nrec;   /* new number of left block entries */
-       xfs_buf_t               *rbp;   /* buffer for right (current) block */
-       xfs_inobt_block_t       *right; /* right (current) btree block */
-       xfs_inobt_key_t         *rkp=NULL;      /* key pointer for right block */
-       xfs_inobt_ptr_t         *rpp=NULL;      /* address pointer for right block */
-       xfs_inobt_rec_t         *rrp=NULL;      /* record pointer for right block */
-
-       /*
-        * Set up variables for this block as "right".
-        */
-       rbp = cur->bc_bufs[level];
-       right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-               return error;
-#endif
-       /*
-        * If we've got no left sibling then we can't shift an entry left.
-        */
-       if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] <= 1) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the left neighbor as "left".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
-                       0, &lbp, XFS_INO_BTREE_REF)))
-               return error;
-       left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               *stat = 0;
-               return 0;
-       }
-       nrec = be16_to_cpu(left->bb_numrecs) + 1;
-       /*
-        * If non-leaf, copy a key and a ptr to the left block.
-        */
-       if (level > 0) {
-               lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               *lkp = *rkp;
-               xfs_inobt_log_keys(cur, lbp, nrec, nrec);
-               lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
-                       return error;
-#endif
-               *lpp = *rpp;
-               xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
-       }
-       /*
-        * If leaf, copy a record to the left block.
-        */
-       else {
-               lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               *lrp = *rrp;
-               xfs_inobt_log_recs(cur, lbp, nrec, nrec);
-       }
-       /*
-        * Bump and log left's numrecs, decrement and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, 1);
-       xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
-       else
-               xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
-#endif
-       be16_add_cpu(&right->bb_numrecs, -1);
-       xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Slide the contents of right down one entry.
-        */
-       if (level > 0) {
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
-                                       level)))
-                               return error;
-               }
-#endif
-               memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-       } else {
-               memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               key.ir_startino = rrp->ir_startino;
-               rkp = &key;
-       }
-       /*
-        * Update the parent key values of right.
-        */
-       if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
-               return error;
-       /*
-        * Slide the cursor value left one.
-        */
-       cur->bc_ptrs[level]--;
-       *stat = 1;
-       return 0;
+       return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+                         cur->bc_rec.i.ir_startino;
 }
 
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int                             /* error */
-xfs_inobt_newroot(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     *stat)  /* success/failure */
+STATIC int
+xfs_inobt_kill_root(
+       struct xfs_btree_cur    *cur,
+       struct xfs_buf          *bp,
+       int                     level,
+       union xfs_btree_ptr     *newroot)
 {
-       xfs_agi_t               *agi;   /* a.g. inode header */
-       xfs_alloc_arg_t         args;   /* allocation argument structure */
-       xfs_inobt_block_t       *block; /* one half of the old root block */
-       xfs_buf_t               *bp;    /* buffer containing block */
-       int                     error;  /* error return value */
-       xfs_inobt_key_t         *kp;    /* btree key pointer */
-       xfs_agblock_t           lbno;   /* left block number */
-       xfs_buf_t               *lbp;   /* left buffer pointer */
-       xfs_inobt_block_t       *left;  /* left btree block */
-       xfs_buf_t               *nbp;   /* new (root) buffer */
-       xfs_inobt_block_t       *new;   /* new (root) btree block */
-       int                     nptr;   /* new value for key index, 1 or 2 */
-       xfs_inobt_ptr_t         *pp;    /* btree address pointer */
-       xfs_agblock_t           rbno;   /* right block number */
-       xfs_buf_t               *rbp;   /* right buffer pointer */
-       xfs_inobt_block_t       *right; /* right btree block */
-       xfs_inobt_rec_t         *rp;    /* btree record pointer */
+       int                     error;
 
-       ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+       XFS_BTREE_STATS_INC(cur, killroot);
 
        /*
-        * Get a block & a buffer.
+        * Update the root pointer, decreasing the level by 1 and then
+        * free the old root.
         */
-       agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
-               be32_to_cpu(agi->agi_root));
-       args.mod = args.minleft = args.alignment = args.total = args.wasdel =
-               args.isfl = args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       if ((error = xfs_alloc_vextent(&args)))
+       xfs_inobt_set_root(cur, newroot, -1);
+       error = xfs_inobt_free_block(cur, bp);
+       if (error) {
+               XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
                return error;
-       /*
-        * None available, we fail.
-        */
-       if (args.fsbno == NULLFSBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
-       new = XFS_BUF_TO_INOBT_BLOCK(nbp);
-       /*
-        * Set the root data in the a.g. inode structure.
-        */
-       agi->agi_root = cpu_to_be32(args.agbno);
-       be32_add_cpu(&agi->agi_level, 1);
-       xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
-               XFS_AGI_ROOT | XFS_AGI_LEVEL);
-       /*
-        * At the previous root level there are now two blocks: the old
-        * root, and the new block generated when it was split.
-        * We don't know which one the cursor is pointing at, so we
-        * set up variables "left" and "right" for each case.
-        */
-       bp = cur->bc_bufs[cur->bc_nlevels - 1];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
-               return error;
-#endif
-       if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
-               /*
-                * Our block is left, pick up the right block.
-                */
-               lbp = bp;
-               lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
-               left = block;
-               rbno = be32_to_cpu(left->bb_rightsib);
-               if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
-                               rbno, 0, &rbp, XFS_INO_BTREE_REF)))
-                       return error;
-               bp = rbp;
-               right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-               if ((error = xfs_btree_check_sblock(cur, right,
-                               cur->bc_nlevels - 1, rbp)))
-                       return error;
-               nptr = 1;
-       } else {
-               /*
-                * Our block is right, pick up the left block.
-                */
-               rbp = bp;
-               rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
-               right = block;
-               lbno = be32_to_cpu(right->bb_leftsib);
-               if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
-                               lbno, 0, &lbp, XFS_INO_BTREE_REF)))
-                       return error;
-               bp = lbp;
-               left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-               if ((error = xfs_btree_check_sblock(cur, left,
-                               cur->bc_nlevels - 1, lbp)))
-                       return error;
-               nptr = 2;
-       }
-       /*
-        * Fill in the new block's btree header and log it.
-        */
-       new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       new->bb_level = cpu_to_be16(cur->bc_nlevels);
-       new->bb_numrecs = cpu_to_be16(2);
-       new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-       new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-       xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
-       ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
-       /*
-        * Fill in the key data in the new root.
-        */
-       kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
-       if (be16_to_cpu(left->bb_level) > 0) {
-               kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
-               kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
-       } else {
-               rp = XFS_INOBT_REC_ADDR(left, 1, cur);
-               kp[0].ir_startino = rp->ir_startino;
-               rp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               kp[1].ir_startino = rp->ir_startino;
        }
-       xfs_inobt_log_keys(cur, nbp, 1, 2);
-       /*
-        * Fill in the pointer data in the new root.
-        */
-       pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
-       pp[0] = cpu_to_be32(lbno);
-       pp[1] = cpu_to_be32(rbno);
-       xfs_inobt_log_ptrs(cur, nbp, 1, 2);
-       /*
-        * Fix up the cursor.
-        */
-       xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
-       cur->bc_ptrs[cur->bc_nlevels] = nptr;
-       cur->bc_nlevels++;
-       *stat = 1;
-       return 0;
-}
 
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int                             /* error */
-xfs_inobt_rshift(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to shift record on */
-       int                     *stat)  /* success/failure */
-{
-       int                     error;  /* error return value */
-       int                     i;      /* loop index */
-       xfs_inobt_key_t         key;    /* key value for leaf level upward */
-       xfs_buf_t               *lbp;   /* buffer for left (current) block */
-       xfs_inobt_block_t       *left;  /* left (current) btree block */
-       xfs_inobt_key_t         *lkp;   /* key pointer for left block */
-       xfs_inobt_ptr_t         *lpp;   /* address pointer for left block */
-       xfs_inobt_rec_t         *lrp;   /* record pointer for left block */
-       xfs_buf_t               *rbp;   /* buffer for right neighbor block */
-       xfs_inobt_block_t       *right; /* right neighbor btree block */
-       xfs_inobt_key_t         *rkp;   /* key pointer for right block */
-       xfs_inobt_ptr_t         *rpp;   /* address pointer for right block */
-       xfs_inobt_rec_t         *rrp=NULL;      /* record pointer for right block */
-       xfs_btree_cur_t         *tcur;  /* temporary cursor */
+       XFS_BTREE_STATS_INC(cur, free);
 
-       /*
-        * Set up variables for this block as "left".
-        */
-       lbp = cur->bc_bufs[level];
-       left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * If we've got no right sibling then we can't shift an entry right.
-        */
-       if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * If the cursor entry is the one that would be moved, don't
-        * do it... it's too complicated.
-        */
-       if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Set up the right neighbor as "right".
-        */
-       if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                       cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
-                       0, &rbp, XFS_INO_BTREE_REF)))
-               return error;
-       right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-       if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
-               return error;
-       /*
-        * If it's full, it can't take another entry.
-        */
-       if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Make a hole at the start of the right neighbor block, then
-        * copy the last left block entry to the hole.
-        */
-       if (level > 0) {
-               lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
-                               return error;
-               }
-#endif
-               memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
-                       return error;
-#endif
-               *rkp = *lkp;
-               *rpp = *lpp;
-               xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-       } else {
-               lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               *rrp = *lrp;
-               xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
-               key.ir_startino = rrp->ir_startino;
-               rkp = &key;
-       }
-       /*
-        * Decrement and log left's numrecs, bump and log right's numrecs.
-        */
-       be16_add_cpu(&left->bb_numrecs, -1);
-       xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-       be16_add_cpu(&right->bb_numrecs, 1);
-#ifdef DEBUG
-       if (level > 0)
-               xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
-       else
-               xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
-#endif
-       xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
-       /*
-        * Using a temporary cursor, update the parent key values of the
-        * block on the right.
-        */
-       if ((error = xfs_btree_dup_cursor(cur, &tcur)))
-               return error;
-       xfs_btree_lastrec(tcur, level);
-       if ((error = xfs_inobt_increment(tcur, level, &i)) ||
-           (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
-               xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
-               return error;
-       }
-       xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
-       *stat = 1;
+       cur->bc_bufs[level] = NULL;
+       cur->bc_nlevels--;
+
+       XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
        return 0;
 }
 
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int                             /* error */
-xfs_inobt_split(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level to split */
-       xfs_agblock_t           *bnop,  /* output: block number allocated */
-       xfs_inobt_key_t         *keyp,  /* output: first key of new block */
-       xfs_btree_cur_t         **curp, /* output: new cursor */
-       int                     *stat)  /* success/failure */
-{
-       xfs_alloc_arg_t         args;   /* allocation argument structure */
-       int                     error;  /* error return value */
-       int                     i;      /* loop index/record number */
-       xfs_agblock_t           lbno;   /* left (current) block number */
-       xfs_buf_t               *lbp;   /* buffer for left block */
-       xfs_inobt_block_t       *left;  /* left (current) btree block */
-       xfs_inobt_key_t         *lkp;   /* left btree key pointer */
-       xfs_inobt_ptr_t         *lpp;   /* left btree address pointer */
-       xfs_inobt_rec_t         *lrp;   /* left btree record pointer */
-       xfs_buf_t               *rbp;   /* buffer for right block */
-       xfs_inobt_block_t       *right; /* right (new) btree block */
-       xfs_inobt_key_t         *rkp;   /* right btree key pointer */
-       xfs_inobt_ptr_t         *rpp;   /* right btree address pointer */
-       xfs_inobt_rec_t         *rrp;   /* right btree record pointer */
-
-       /*
-        * Set up left block (current one).
-        */
-       lbp = cur->bc_bufs[level];
-       args.tp = cur->bc_tp;
-       args.mp = cur->bc_mp;
-       lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
-       /*
-        * Allocate the new block.
-        * If we can't do it, we're toast.  Give up.
-        */
-       args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
-       args.mod = args.minleft = args.alignment = args.total = args.wasdel =
-               args.isfl = args.userdata = args.minalignslop = 0;
-       args.minlen = args.maxlen = args.prod = 1;
-       args.type = XFS_ALLOCTYPE_NEAR_BNO;
-       if ((error = xfs_alloc_vextent(&args)))
-               return error;
-       if (args.fsbno == NULLFSBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       ASSERT(args.len == 1);
-       rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
-       /*
-        * Set up the new block as "right".
-        */
-       right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-       /*
-        * "Left" is the current (according to the cursor) block.
-        */
-       left = XFS_BUF_TO_INOBT_BLOCK(lbp);
 #ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
-               return error;
-#endif
-       /*
-        * Fill in the btree header for the new block.
-        */
-       right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
-       right->bb_level = left->bb_level;
-       right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
-       /*
-        * Make sure that if there's an odd number of entries now, that
-        * each new block will have the same number of entries.
-        */
-       if ((be16_to_cpu(left->bb_numrecs) & 1) &&
-           cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
-               be16_add_cpu(&right->bb_numrecs, 1);
-       i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
-       /*
-        * For non-leaf blocks, copy keys and addresses over to the new block.
-        */
-       if (level > 0) {
-               lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
-               lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
-               rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
-               rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
-               for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
-                       if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
-                               return error;
-               }
-#endif
-               memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
-               memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-               xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               *keyp = *rkp;
-       }
-       /*
-        * For leaf blocks, copy records over to the new block.
-        */
-       else {
-               lrp = XFS_INOBT_REC_ADDR(left, i, cur);
-               rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
-               memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
-               xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
-               keyp->ir_startino = rrp->ir_startino;
-       }
-       /*
-        * Find the left block number by looking in the buffer.
-        * Adjust numrecs, sibling pointers.
-        */
-       be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
-       right->bb_rightsib = left->bb_rightsib;
-       left->bb_rightsib = cpu_to_be32(args.agbno);
-       right->bb_leftsib = cpu_to_be32(lbno);
-       xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
-       xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
-       /*
-        * If there's a block to the new block's right, make that block
-        * point back to right instead of to left.
-        */
-       if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
-               xfs_inobt_block_t       *rrblock;       /* rr btree block */
-               xfs_buf_t               *rrbp;          /* buffer for rrblock */
-
-               if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
-                               be32_to_cpu(right->bb_rightsib), 0, &rrbp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
-               if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
-                       return error;
-               rrblock->bb_leftsib = cpu_to_be32(args.agbno);
-               xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
-       }
-       /*
-        * If the cursor is really in the right block, move it there.
-        * If it's just pointing past the last entry in left, then we'll
-        * insert there, so don't change anything in that case.
-        */
-       if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
-               xfs_btree_setbuf(cur, level, rbp);
-               cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
-       }
-       /*
-        * If there are more levels, we'll need another cursor which refers
-        * the right block, no matter where this cursor was.
-        */
-       if (level + 1 < cur->bc_nlevels) {
-               if ((error = xfs_btree_dup_cursor(cur, curp)))
-                       return error;
-               (*curp)->bc_ptrs[level + 1]++;
-       }
-       *bnop = args.agbno;
-       *stat = 1;
-       return 0;
+STATIC int
+xfs_inobt_keys_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *k1,
+       union xfs_btree_key     *k2)
+{
+       return be32_to_cpu(k1->inobt.ir_startino) <
+               be32_to_cpu(k2->inobt.ir_startino);
 }
 
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int                             /* error */
-xfs_inobt_updkey(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_inobt_key_t         *keyp,  /* new key value to update to */
-       int                     level)  /* starting level for update */
+STATIC int
+xfs_inobt_recs_inorder(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *r1,
+       union xfs_btree_rec     *r2)
 {
-       int                     ptr;    /* index of key in block */
-
-       /*
-        * Go up the tree from this level toward the root.
-        * At each level, update the key value to the value input.
-        * Stop when we reach a level where the cursor isn't pointing
-        * at the first entry in the block.
-        */
-       for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
-               xfs_buf_t               *bp;    /* buffer for block */
-               xfs_inobt_block_t       *block; /* btree block */
-#ifdef DEBUG
-               int                     error;  /* error return value */
-#endif
-               xfs_inobt_key_t         *kp;    /* ptr to btree block keys */
-
-               bp = cur->bc_bufs[level];
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-                       return error;
-#endif
-               ptr = cur->bc_ptrs[level];
-               kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
-               *kp = *keyp;
-               xfs_inobt_log_keys(cur, bp, ptr, ptr);
-       }
-       return 0;
+       return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+               be32_to_cpu(r2->inobt.ir_startino);
 }
+#endif /* DEBUG */
 
-/*
- * Externally visible routines.
- */
+#ifdef XFS_BTREE_TRACE
+ktrace_t       *xfs_inobt_trace_buf;
 
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_inobt_decrement(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC void
+xfs_inobt_trace_enter(
+       struct xfs_btree_cur    *cur,
+       const char              *func,
+       char                    *s,
+       int                     type,
+       int                     line,
+       __psunsigned_t          a0,
+       __psunsigned_t          a1,
+       __psunsigned_t          a2,
+       __psunsigned_t          a3,
+       __psunsigned_t          a4,
+       __psunsigned_t          a5,
+       __psunsigned_t          a6,
+       __psunsigned_t          a7,
+       __psunsigned_t          a8,
+       __psunsigned_t          a9,
+       __psunsigned_t          a10)
 {
-       xfs_inobt_block_t       *block; /* btree block */
-       int                     error;
-       int                     lev;    /* btree level */
-
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the left at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
-       /*
-        * Decrement the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (--cur->bc_ptrs[level] > 0) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * Get a pointer to the btree block.
-        */
-       block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level,
-                       cur->bc_bufs[level])))
-               return error;
-#endif
-       /*
-        * If we just went off the left edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * March up the tree decrementing pointers.
-        * Stop when we don't go off the left edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               if (--cur->bc_ptrs[lev] > 0)
-                       break;
-               /*
-                * Read-ahead the left block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
-       }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
-               xfs_buf_t       *bp;    /* buffer containing btree block */
-
-               agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
-       }
-       *stat = 1;
-       return 0;
+       ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
+               (void *)func, (void *)s, NULL, (void *)cur,
+               (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+               (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+               (void *)a8, (void *)a9, (void *)a10);
 }
 
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int                                    /* error */
-xfs_inobt_delete(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
+STATIC void
+xfs_inobt_trace_cursor(
+       struct xfs_btree_cur    *cur,
+       __uint32_t              *s0,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       int             error;
-       int             i;              /* result code */
-       int             level;          /* btree level */
-
-       /*
-        * Go up the tree, starting at leaf level.
-        * If 2 is returned then a join was done; go to the next level.
-        * Otherwise we are done.
-        */
-       for (level = 0, i = 2; i == 2; level++) {
-               if ((error = xfs_inobt_delrec(cur, level, &i)))
-                       return error;
-       }
-       if (i == 0) {
-               for (level = 1; level < cur->bc_nlevels; level++) {
-                       if (cur->bc_ptrs[level] == 0) {
-                               if ((error = xfs_inobt_decrement(cur, level, &i)))
-                                       return error;
-                               break;
-                       }
-               }
-       }
-       *stat = i;
-       return 0;
+       *s0 = cur->bc_private.a.agno;
+       *l0 = cur->bc_rec.i.ir_startino;
+       *l1 = cur->bc_rec.i.ir_free;
 }
 
-
-/*
- * Get the data from the pointed-to record.
- */
-int                                    /* error */
-xfs_inobt_get_rec(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agino_t             *ino,   /* output: starting inode of chunk */
-       __int32_t               *fcnt,  /* output: number of free inodes */
-       xfs_inofree_t           *free,  /* output: free inode mask */
-       int                     *stat)  /* output: success/failure */
+STATIC void
+xfs_inobt_trace_key(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_key     *key,
+       __uint64_t              *l0,
+       __uint64_t              *l1)
 {
-       xfs_inobt_block_t       *block; /* btree block */
-       xfs_buf_t               *bp;    /* buffer containing btree block */
-#ifdef DEBUG
-       int                     error;  /* error return value */
-#endif
-       int                     ptr;    /* record number */
-       xfs_inobt_rec_t         *rec;   /* record data */
-
-       bp = cur->bc_bufs[0];
-       ptr = cur->bc_ptrs[0];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
-               return error;
-#endif
-       /*
-        * Off the right end or left end, return failure.
-        */
-       if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * Point to the record and extract its data.
-        */
-       rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
-       *ino = be32_to_cpu(rec->ir_startino);
-       *fcnt = be32_to_cpu(rec->ir_freecount);
-       *free = be64_to_cpu(rec->ir_free);
-       *stat = 1;
-       return 0;
+       *l0 = be32_to_cpu(key->inobt.ir_startino);
+       *l1 = 0;
 }
 
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int                                    /* error */
-xfs_inobt_increment(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       int                     level,  /* level in btree, 0 is leaf */
-       int                     *stat)  /* success/failure */
+STATIC void
+xfs_inobt_trace_record(
+       struct xfs_btree_cur    *cur,
+       union xfs_btree_rec     *rec,
+       __uint64_t              *l0,
+       __uint64_t              *l1,
+       __uint64_t              *l2)
 {
-       xfs_inobt_block_t       *block; /* btree block */
-       xfs_buf_t               *bp;    /* buffer containing btree block */
-       int                     error;  /* error return value */
-       int                     lev;    /* btree level */
+       *l0 = be32_to_cpu(rec->inobt.ir_startino);
+       *l1 = be32_to_cpu(rec->inobt.ir_freecount);
+       *l2 = be64_to_cpu(rec->inobt.ir_free);
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+       .rec_len                = sizeof(xfs_inobt_rec_t),
+       .key_len                = sizeof(xfs_inobt_key_t),
+
+       .dup_cursor             = xfs_inobt_dup_cursor,
+       .set_root               = xfs_inobt_set_root,
+       .kill_root              = xfs_inobt_kill_root,
+       .alloc_block            = xfs_inobt_alloc_block,
+       .free_block             = xfs_inobt_free_block,
+       .get_minrecs            = xfs_inobt_get_minrecs,
+       .get_maxrecs            = xfs_inobt_get_maxrecs,
+       .init_key_from_rec      = xfs_inobt_init_key_from_rec,
+       .init_rec_from_key      = xfs_inobt_init_rec_from_key,
+       .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
+       .init_ptr_from_cur      = xfs_inobt_init_ptr_from_cur,
+       .key_diff               = xfs_inobt_key_diff,
 
-       ASSERT(level < cur->bc_nlevels);
-       /*
-        * Read-ahead to the right at this level.
-        */
-       xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
-       /*
-        * Get a pointer to the btree block.
-        */
-       bp = cur->bc_bufs[level];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
-               return error;
-#endif
-       /*
-        * Increment the ptr at this level.  If we're still in the block
-        * then we're done.
-        */
-       if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
-               *stat = 1;
-               return 0;
-       }
-       /*
-        * If we just went off the right edge of the tree, return failure.
-        */
-       if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
-               *stat = 0;
-               return 0;
-       }
-       /*
-        * March up the tree incrementing pointers.
-        * Stop when we don't go off the right edge of a block.
-        */
-       for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
-               bp = cur->bc_bufs[lev];
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
 #ifdef DEBUG
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
+       .keys_inorder           = xfs_inobt_keys_inorder,
+       .recs_inorder           = xfs_inobt_recs_inorder,
 #endif
-               if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
-                       break;
-               /*
-                * Read-ahead the right block, we're going to read it
-                * in the next loop.
-                */
-               xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
-       }
-       /*
-        * If we went off the root then we are seriously confused.
-        */
-       ASSERT(lev < cur->bc_nlevels);
-       /*
-        * Now walk back down the tree, fixing up the cursor's buffer
-        * pointers and key numbers.
-        */
-       for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
-            lev > level; ) {
-               xfs_agblock_t   agbno;  /* block number of btree block */
 
-               agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
-               if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
-                               cur->bc_private.a.agno, agbno, 0, &bp,
-                               XFS_INO_BTREE_REF)))
-                       return error;
-               lev--;
-               xfs_btree_setbuf(cur, lev, bp);
-               block = XFS_BUF_TO_INOBT_BLOCK(bp);
-               if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
-                       return error;
-               cur->bc_ptrs[lev] = 1;
-       }
-       *stat = 1;
-       return 0;
-}
+#ifdef XFS_BTREE_TRACE
+       .trace_enter            = xfs_inobt_trace_enter,
+       .trace_cursor           = xfs_inobt_trace_cursor,
+       .trace_key              = xfs_inobt_trace_key,
+       .trace_record           = xfs_inobt_trace_record,
+#endif
+};
 
 /*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
+ * Allocate a new inode btree cursor.
  */
-int                                    /* error */
-xfs_inobt_insert(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       int             *stat)          /* success/failure */
+struct xfs_btree_cur *                         /* new inode btree cursor */
+xfs_inobt_init_cursor(
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       struct xfs_buf          *agbp,          /* buffer for agi structure */
+       xfs_agnumber_t          agno)           /* allocation group number */
 {
-       int             error;          /* error return value */
-       int             i;              /* result value, 0 for failure */
-       int             level;          /* current level number in btree */
-       xfs_agblock_t   nbno;           /* new block number (split result) */
-       xfs_btree_cur_t *ncur;          /* new cursor (split result) */
-       xfs_inobt_rec_t nrec;           /* record being inserted this level */
-       xfs_btree_cur_t *pcur;          /* previous level's cursor */
+       struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
+       struct xfs_btree_cur    *cur;
 
-       level = 0;
-       nbno = NULLAGBLOCK;
-       nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
-       nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
-       nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
-       ncur = NULL;
-       pcur = cur;
-       /*
-        * Loop going up the tree, starting at the leaf level.
-        * Stop when we don't get a split block, that must mean that
-        * the insert is finished with this level.
-        */
-       do {
-               /*
-                * Insert nrec/nbno into this level of the tree.
-                * Note if we fail, nbno will be null.
-                */
-               if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
-                               &i))) {
-                       if (pcur != cur)
-                               xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
-                       return error;
-               }
-               /*
-                * See if the cursor we just used is trash.
-                * Can't trash the caller's cursor, but otherwise we should
-                * if ncur is a new cursor or we're about to be done.
-                */
-               if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
-                       cur->bc_nlevels = pcur->bc_nlevels;
-                       xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
-               }
-               /*
-                * If we got a new cursor, switch to it.
-                */
-               if (ncur) {
-                       pcur = ncur;
-                       ncur = NULL;
-               }
-       } while (nbno != NULLAGBLOCK);
-       *stat = i;
-       return 0;
-}
+       cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
 
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-int                                    /* error */
-xfs_inobt_lookup_eq(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agino_t     ino,            /* starting inode of chunk */
-       __int32_t       fcnt,           /* free inode count */
-       xfs_inofree_t   free,           /* free inode mask */
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.i.ir_startino = ino;
-       cur->bc_rec.i.ir_freecount = fcnt;
-       cur->bc_rec.i.ir_free = free;
-       return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
+       cur->bc_tp = tp;
+       cur->bc_mp = mp;
+       cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+       cur->bc_btnum = XFS_BTNUM_INO;
+       cur->bc_blocklog = mp->m_sb.sb_blocklog;
 
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_inobt_lookup_ge(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agino_t     ino,            /* starting inode of chunk */
-       __int32_t       fcnt,           /* free inode count */
-       xfs_inofree_t   free,           /* free inode mask */
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.i.ir_startino = ino;
-       cur->bc_rec.i.ir_freecount = fcnt;
-       cur->bc_rec.i.ir_free = free;
-       return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
+       cur->bc_ops = &xfs_inobt_ops;
 
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int                                    /* error */
-xfs_inobt_lookup_le(
-       xfs_btree_cur_t *cur,           /* btree cursor */
-       xfs_agino_t     ino,            /* starting inode of chunk */
-       __int32_t       fcnt,           /* free inode count */
-       xfs_inofree_t   free,           /* free inode mask */
-       int             *stat)          /* success/failure */
-{
-       cur->bc_rec.i.ir_startino = ino;
-       cur->bc_rec.i.ir_freecount = fcnt;
-       cur->bc_rec.i.ir_free = free;
-       return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
+       cur->bc_private.a.agbp = agbp;
+       cur->bc_private.a.agno = agno;
+
+       return cur;
 }
 
 /*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Calculate number of records in an inobt btree block.
  */
-int                                    /* error */
-xfs_inobt_update(
-       xfs_btree_cur_t         *cur,   /* btree cursor */
-       xfs_agino_t             ino,    /* starting inode of chunk */
-       __int32_t               fcnt,   /* free inode count */
-       xfs_inofree_t           free)   /* free inode mask */
+int
+xfs_inobt_maxrecs(
+       struct xfs_mount        *mp,
+       int                     blocklen,
+       int                     leaf)
 {
-       xfs_inobt_block_t       *block; /* btree block to update */
-       xfs_buf_t               *bp;    /* buffer containing btree block */
-       int                     error;  /* error return value */
-       int                     ptr;    /* current record number (updating) */
-       xfs_inobt_rec_t         *rp;    /* pointer to updated record */
+       blocklen -= XFS_INOBT_BLOCK_LEN(mp);
 
-       /*
-        * Pick up the current block.
-        */
-       bp = cur->bc_bufs[0];
-       block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
-       if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
-               return error;
-#endif
-       /*
-        * Get the address of the rec to be updated.
-        */
-       ptr = cur->bc_ptrs[0];
-       rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
-       /*
-        * Fill in the new contents and log them.
-        */
-       rp->ir_startino = cpu_to_be32(ino);
-       rp->ir_freecount = cpu_to_be32(fcnt);
-       rp->ir_free = cpu_to_be64(free);
-       xfs_inobt_log_recs(cur, bp, ptr, ptr);
-       /*
-        * Updating first record in leaf. Pass new key value up to our parent.
-        */
-       if (ptr == 1) {
-               xfs_inobt_key_t key;    /* key containing [ino] */
-
-               key.ir_startino = cpu_to_be32(ino);
-               if ((error = xfs_inobt_updkey(cur, &key, 1)))
-                       return error;
-       }
-       return 0;
+       if (leaf)
+               return blocklen / sizeof(xfs_inobt_rec_t);
+       return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
 }
index 8efc4a5b8b9289f30d8d11c069021b6a606e592d..37e5dd01a5779a9da7cd836439d8a1cee0415b8b 100644 (file)
@@ -24,7 +24,6 @@
 
 struct xfs_buf;
 struct xfs_btree_cur;
-struct xfs_btree_sblock;
 struct xfs_mount;
 
 /*
@@ -70,11 +69,6 @@ typedef struct xfs_inobt_key {
 /* btree pointer type */
 typedef __be32 xfs_inobt_ptr_t;
 
-/* btree block header type */
-typedef        struct xfs_btree_sblock xfs_inobt_block_t;
-
-#define        XFS_BUF_TO_INOBT_BLOCK(bp)      ((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
-
 /*
  * Bit manipulations for ir_free.
  */
@@ -84,14 +78,6 @@ typedef      struct xfs_btree_sblock xfs_inobt_block_t;
 #define        XFS_INOBT_SET_FREE(rp,i)        ((rp)->ir_free |= XFS_INOBT_MASK(i))
 #define        XFS_INOBT_CLR_FREE(rp,i)        ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
 
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define        XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
-#define        XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
-#define        XFS_INOBT_IS_LAST_REC(cur)      \
-       ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
-
 /*
  * Maximum number of inode btree levels.
  */
@@ -104,75 +90,38 @@ typedef    struct xfs_btree_sblock xfs_inobt_block_t;
 #define        XFS_PREALLOC_BLOCKS(mp)         ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
 
 /*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_INOBT_REC_ADDR(bb,i,cur) \
-       (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i))
-
-#define        XFS_INOBT_KEY_ADDR(bb,i,cur) \
-       (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i))
-
-#define        XFS_INOBT_PTR_ADDR(bb,i,cur) \
-       (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \
-                               i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
-                            __int32_t *fcnt, xfs_inofree_t *free, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
  */
-extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free, int *stat);
+#define XFS_INOBT_BLOCK_LEN(mp)        XFS_BTREE_SBLOCK_LEN
 
 /*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino,
-                               __int32_t fcnt, xfs_inofree_t free);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+       ((xfs_inobt_rec_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+       ((xfs_inobt_key_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+       ((xfs_inobt_ptr_t *) \
+               ((char *)(block) + \
+                XFS_INOBT_BLOCK_LEN(mp) + \
+                (maxrecs) * sizeof(xfs_inobt_key_t) + \
+                ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+               struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
 #endif /* __XFS_IALLOC_BTREE_H__ */
index e229e9e001c291fa556c043cf79c9070743675e7..bf4dc5eb4cfc77c673c6843ec1ad1c12fb808182 100644 (file)
 #include "xfs_ialloc.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
 
 /*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, attach it to the provided
- * vnode.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and attach the provided vnode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *              for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- *       if known (as by bulkstat), else 0.
+ * Check the validity of the inode we just found it the cache
  */
-STATIC int
-xfs_iget_core(
-       struct inode    *inode,
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_ino_t       ino,
-       uint            flags,
-       uint            lock_flags,
-       xfs_inode_t     **ipp,
-       xfs_daddr_t     bno)
+static int
+xfs_iget_cache_hit(
+       struct xfs_perag        *pag,
+       struct xfs_inode        *ip,
+       int                     flags,
+       int                     lock_flags) __releases(pag->pag_ici_lock)
 {
-       struct inode    *old_inode;
-       xfs_inode_t     *ip;
-       xfs_inode_t     *iq;
-       int             error;
-       unsigned long   first_index, mask;
-       xfs_perag_t     *pag;
-       xfs_agino_t     agino;
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error = EAGAIN;
 
-       /* the radix tree exists only in inode capable AGs */
-       if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
-               return EINVAL;
-
-       /* get the perag structure and ensure that it's inode capable */
-       pag = xfs_get_perag(mp, ino);
-       if (!pag->pagi_inodeok)
-               return EINVAL;
-       ASSERT(pag->pag_ici_init);
-       agino = XFS_INO_TO_AGINO(mp, ino);
+       /*
+        * If INEW is set this inode is being set up
+        * If IRECLAIM is set this inode is being torn down
+        * Pause and try again.
+        */
+       if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+               XFS_STATS_INC(xs_ig_frecycle);
+               goto out_error;
+       }
 
-again:
-       read_lock(&pag->pag_ici_lock);
-       ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+       /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
+       if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
 
-       if (ip != NULL) {
                /*
-                * If INEW is set this inode is being set up
-                * we need to pause and try again.
+                * If lookup is racing with unlink, then we should return an
+                * error immediately so we don't remove it from the reclaim
+                * list and potentially leak the inode.
                 */
-               if (xfs_iflags_test(ip, XFS_INEW)) {
-                       read_unlock(&pag->pag_ici_lock);
-                       delay(1);
-                       XFS_STATS_INC(xs_ig_frecycle);
-
-                       goto again;
+               if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+                       error = ENOENT;
+                       goto out_error;
                }
 
-               old_inode = ip->i_vnode;
-               if (old_inode == NULL) {
-                       /*
-                        * If IRECLAIM is set this inode is
-                        * on its way out of the system,
-                        * we need to pause and try again.
-                        */
-                       if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
-                               read_unlock(&pag->pag_ici_lock);
-                               delay(1);
-                               XFS_STATS_INC(xs_ig_frecycle);
-
-                               goto again;
-                       }
-                       ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-
-                       /*
-                        * If lookup is racing with unlink, then we
-                        * should return an error immediately so we
-                        * don't remove it from the reclaim list and
-                        * potentially leak the inode.
-                        */
-                       if ((ip->i_d.di_mode == 0) &&
-                           !(flags & XFS_IGET_CREATE)) {
-                               read_unlock(&pag->pag_ici_lock);
-                               xfs_put_perag(mp, pag);
-                               return ENOENT;
-                       }
-
-                       xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
-
-                       XFS_STATS_INC(xs_ig_found);
-                       xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-                       read_unlock(&pag->pag_ici_lock);
-
-                       XFS_MOUNT_ILOCK(mp);
-                       list_del_init(&ip->i_reclaim);
-                       XFS_MOUNT_IUNLOCK(mp);
-
-                       goto finish_inode;
-
-               } else if (inode != old_inode) {
-                       /* The inode is being torn down, pause and
-                        * try again.
-                        */
-                       if (old_inode->i_state & (I_FREEING | I_CLEAR)) {
-                               read_unlock(&pag->pag_ici_lock);
-                               delay(1);
-                               XFS_STATS_INC(xs_ig_frecycle);
-
-                               goto again;
-                       }
-/* Chances are the other vnode (the one in the inode) is being torn
-* down right now, and we landed on top of it. Question is, what do
-* we do? Unhook the old inode and hook up the new one?
-*/
-                       cmn_err(CE_PANIC,
-               "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
-                                       old_inode, inode);
-               }
+               xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
                /*
-                * Inode cache hit
+                * We need to re-initialise the VFS inode as it has been
+                * 'freed' by the VFS. Do this here so we can deal with
+                * errors cleanly, then tag it so it can be set up correctly
+                * later.
                 */
-               read_unlock(&pag->pag_ici_lock);
-               XFS_STATS_INC(xs_ig_found);
-
-finish_inode:
-               if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-                       xfs_put_perag(mp, pag);
-                       return ENOENT;
+               if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+                       error = ENOMEM;
+                       goto out_error;
                }
 
-               if (lock_flags != 0)
-                       xfs_ilock(ip, lock_flags);
+               /*
+                * We must set the XFS_INEW flag before clearing the
+                * XFS_IRECLAIMABLE flag so that if a racing lookup does
+                * not find the XFS_IRECLAIMABLE above but has the igrab()
+                * below succeed we can safely check XFS_INEW to detect
+                * that this inode is still being initialised.
+                */
+               xfs_iflags_set(ip, XFS_INEW);
+               xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+
+               /* clear the radix tree reclaim flag as well. */
+               __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+       } else if (!igrab(VFS_I(ip))) {
+               /* If the VFS inode is being torn down, pause and try again. */
+               XFS_STATS_INC(xs_ig_frecycle);
+               goto out_error;
+       } else if (xfs_iflags_test(ip, XFS_INEW)) {
+               /*
+                * We are racing with another cache hit that is
+                * currently recycling this inode out of the XFS_IRECLAIMABLE
+                * state. Wait for the initialisation to complete before
+                * continuing.
+                */
+               wait_on_inode(VFS_I(ip));
+       }
 
-               xfs_iflags_clear(ip, XFS_ISTALE);
-               xfs_itrace_exit_tag(ip, "xfs_iget.found");
-               goto return_ip;
+       if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+               error = ENOENT;
+               iput(VFS_I(ip));
+               goto out_error;
        }
 
-       /*
-        * Inode cache miss
-        */
+       /* We've got a live one. */
+       read_unlock(&pag->pag_ici_lock);
+
+       if (lock_flags != 0)
+               xfs_ilock(ip, lock_flags);
+
+       xfs_iflags_clear(ip, XFS_ISTALE);
+       xfs_itrace_exit_tag(ip, "xfs_iget.found");
+       XFS_STATS_INC(xs_ig_found);
+       return 0;
+
+out_error:
        read_unlock(&pag->pag_ici_lock);
-       XFS_STATS_INC(xs_ig_missed);
+       return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       xfs_trans_t             *tp,
+       xfs_ino_t               ino,
+       struct xfs_inode        **ipp,
+       xfs_daddr_t             bno,
+       int                     flags,
+       int                     lock_flags) __releases(pag->pag_ici_lock)
+{
+       struct xfs_inode        *ip;
+       int                     error;
+       unsigned long           first_index, mask;
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
 
        /*
         * Read the disk inode attributes into a new inode structure and get
@@ -203,116 +161,85 @@ finish_inode:
         */
        error = xfs_iread(mp, tp, ino, &ip, bno,
                          (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
-       if (error) {
-               xfs_put_perag(mp, pag);
+       if (error)
                return error;
-       }
 
        xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
 
-
-       mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-                    "xfsino", ip->i_ino);
-       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-       init_waitqueue_head(&ip->i_ipin_wait);
-       atomic_set(&ip->i_pincount, 0);
-
-       /*
-        * Because we want to use a counting completion, complete
-        * the flush completion once to allow a single access to
-        * the flush completion without blocking.
-        */
-       init_completion(&ip->i_flush);
-       complete(&ip->i_flush);
+       if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+               error = ENOENT;
+               goto out_destroy;
+       }
 
        if (lock_flags)
                xfs_ilock(ip, lock_flags);
 
-       if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-               xfs_idestroy(ip);
-               xfs_put_perag(mp, pag);
-               return ENOENT;
-       }
-
        /*
         * Preload the radix tree so we can insert safely under the
-        * write spinlock.
+        * write spinlock. Note that we cannot sleep inside the preload
+        * region.
         */
        if (radix_tree_preload(GFP_KERNEL)) {
-               xfs_idestroy(ip);
-               delay(1);
-               goto again;
+               error = EAGAIN;
+               goto out_unlock;
        }
+
        mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
        first_index = agino & mask;
        write_lock(&pag->pag_ici_lock);
-       /*
-        * insert the new inode
-        */
+
+       /* insert the new inode */
        error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
        if (unlikely(error)) {
-               BUG_ON(error != -EEXIST);
-               write_unlock(&pag->pag_ici_lock);
-               radix_tree_preload_end();
-               xfs_idestroy(ip);
+               WARN_ON(error != -EEXIST);
                XFS_STATS_INC(xs_ig_dup);
-               goto again;
+               error = EAGAIN;
+               goto out_preload_end;
        }
 
-       /*
-        * These values _must_ be set before releasing the radix tree lock!
-        */
+       /* These values _must_ be set before releasing the radix tree lock! */
        ip->i_udquot = ip->i_gdquot = NULL;
        xfs_iflags_set(ip, XFS_INEW);
 
        write_unlock(&pag->pag_ici_lock);
        radix_tree_preload_end();
-
-       /*
-        * Link ip to its mount and thread it on the mount's inode list.
-        */
-       XFS_MOUNT_ILOCK(mp);
-       if ((iq = mp->m_inodes)) {
-               ASSERT(iq->i_mprev->i_mnext == iq);
-               ip->i_mprev = iq->i_mprev;
-               iq->i_mprev->i_mnext = ip;
-               iq->i_mprev = ip;
-               ip->i_mnext = iq;
-       } else {
-               ip->i_mnext = ip;
-               ip->i_mprev = ip;
-       }
-       mp->m_inodes = ip;
-
-       XFS_MOUNT_IUNLOCK(mp);
-       xfs_put_perag(mp, pag);
-
- return_ip:
-       ASSERT(ip->i_df.if_ext_max ==
-              XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
-
-       xfs_iflags_set(ip, XFS_IMODIFIED);
        *ipp = ip;
-
-       /*
-        * Set up the Linux with the Linux inode.
-        */
-       ip->i_vnode = inode;
-       inode->i_private = ip;
-
-       /*
-        * If we have a real type for an on-disk inode, we can set ops(&unlock)
-        * now.  If it's a new inode being created, xfs_ialloc will handle it.
-        */
-       if (ip->i_d.di_mode != 0)
-               xfs_setup_inode(ip);
        return 0;
-}
 
+out_preload_end:
+       write_unlock(&pag->pag_ici_lock);
+       radix_tree_preload_end();
+out_unlock:
+       if (lock_flags)
+               xfs_iunlock(ip, lock_flags);
+out_destroy:
+       xfs_destroy_inode(ip);
+       return error;
+}
 
 /*
- * The 'normal' internal xfs_iget, if needed it will
- * 'allocate', or 'get', the vnode.
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *              for xfs_ilock() for a list of valid values.
+ * bno -- the block number starting the buffer containing the inode,
+ *       if known (as by bulkstat), else 0.
  */
 int
 xfs_iget(
@@ -324,61 +251,65 @@ xfs_iget(
        xfs_inode_t     **ipp,
        xfs_daddr_t     bno)
 {
-       struct inode    *inode;
        xfs_inode_t     *ip;
        int             error;
+       xfs_perag_t     *pag;
+       xfs_agino_t     agino;
 
-       XFS_STATS_INC(xs_ig_attempts);
-
-retry:
-       inode = iget_locked(mp->m_super, ino);
-       if (!inode)
-               /* If we got no inode we are out of memory */
-               return ENOMEM;
-
-       if (inode->i_state & I_NEW) {
-               XFS_STATS_INC(vn_active);
-               XFS_STATS_INC(vn_alloc);
-
-               error = xfs_iget_core(inode, mp, tp, ino, flags,
-                               lock_flags, ipp, bno);
-               if (error) {
-                       make_bad_inode(inode);
-                       if (inode->i_state & I_NEW)
-                               unlock_new_inode(inode);
-                       iput(inode);
-               }
-               return error;
+       /* the radix tree exists only in inode capable AGs */
+       if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+               return EINVAL;
+
+       /* get the perag structure and ensure that it's inode capable */
+       pag = xfs_get_perag(mp, ino);
+       if (!pag->pagi_inodeok)
+               return EINVAL;
+       ASSERT(pag->pag_ici_init);
+       agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+       error = 0;
+       read_lock(&pag->pag_ici_lock);
+       ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+       if (ip) {
+               error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+               if (error)
+                       goto out_error_or_again;
+       } else {
+               read_unlock(&pag->pag_ici_lock);
+               XFS_STATS_INC(xs_ig_missed);
+
+               error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
+                                                       flags, lock_flags);
+               if (error)
+                       goto out_error_or_again;
        }
+       xfs_put_perag(mp, pag);
+
+       xfs_iflags_set(ip, XFS_IMODIFIED);
+       *ipp = ip;
 
+       ASSERT(ip->i_df.if_ext_max ==
+              XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
        /*
-        * If the inode is not fully constructed due to
-        * filehandle mismatches wait for the inode to go
-        * away and try again.
-        *
-        * iget_locked will call __wait_on_freeing_inode
-        * to wait for the inode to go away.
+        * If we have a real type for an on-disk inode, we can set ops(&unlock)
+        * now.  If it's a new inode being created, xfs_ialloc will handle it.
         */
-       if (is_bad_inode(inode)) {
-               iput(inode);
-               delay(1);
-               goto retry;
-       }
+       if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+               xfs_setup_inode(ip);
+       return 0;
 
-       ip = XFS_I(inode);
-       if (!ip) {
-               iput(inode);
+out_error_or_again:
+       if (error == EAGAIN) {
                delay(1);
-               goto retry;
+               goto again;
        }
-
-       if (lock_flags != 0)
-               xfs_ilock(ip, lock_flags);
-       XFS_STATS_INC(xs_ig_found);
-       *ipp = ip;
-       return 0;
+       xfs_put_perag(mp, pag);
+       return error;
 }
 
+
 /*
  * Look for the inode corresponding to the given ino in the hash table.
  * If it is there and its i_transp pointer matches tp, return it.
@@ -462,14 +393,13 @@ xfs_ireclaim(xfs_inode_t *ip)
        xfs_iextract(ip);
 
        /*
-        * Here we do a spurious inode lock in order to coordinate with
-        * xfs_sync().  This is because xfs_sync() references the inodes
-        * in the mount list without taking references on the corresponding
-        * vnodes.  We make that OK here by ensuring that we wait until
-        * the inode is unlocked in xfs_sync() before we go ahead and
-        * free it.  We get both the regular lock and the io lock because
-        * the xfs_sync() code may need to drop the regular one but will
-        * still hold the io lock.
+        * Here we do a spurious inode lock in order to coordinate with inode
+        * cache radix tree lookups.  This is because the lookup can reference
+        * the inodes in the cache without taking references.  We make that OK
+        * here by ensuring that we wait until the inode is unlocked after the
+        * lookup before we go ahead and free it.  We get both the ilock and
+        * the iolock because the code may need to drop the ilock one but will
+        * still hold the iolock.
         */
        xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
 
@@ -479,14 +409,6 @@ xfs_ireclaim(xfs_inode_t *ip)
         */
        XFS_QM_DQDETACH(ip->i_mount, ip);
 
-       /*
-        * Pull our behavior descriptor from the vnode chain.
-        */
-       if (ip->i_vnode) {
-               ip->i_vnode->i_private = NULL;
-               ip->i_vnode = NULL;
-       }
-
        /*
         * Free all memory associated with the inode.
         */
@@ -505,38 +427,13 @@ xfs_iextract(
 {
        xfs_mount_t     *mp = ip->i_mount;
        xfs_perag_t     *pag = xfs_get_perag(mp, ip->i_ino);
-       xfs_inode_t     *iq;
 
        write_lock(&pag->pag_ici_lock);
        radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
        write_unlock(&pag->pag_ici_lock);
        xfs_put_perag(mp, pag);
 
-       /*
-        * Remove from mount's inode list.
-        */
-       XFS_MOUNT_ILOCK(mp);
-       ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
-       iq = ip->i_mnext;
-       iq->i_mprev = ip->i_mprev;
-       ip->i_mprev->i_mnext = iq;
-
-       /*
-        * Fix up the head pointer if it points to the inode being deleted.
-        */
-       if (mp->m_inodes == ip) {
-               if (ip == iq) {
-                       mp->m_inodes = NULL;
-               } else {
-                       mp->m_inodes = iq;
-               }
-       }
-
-       /* Deal with the deleted inodes list */
-       list_del_init(&ip->i_reclaim);
-
        mp->m_ireclaims++;
-       XFS_MOUNT_IUNLOCK(mp);
 }
 
 /*
@@ -737,7 +634,7 @@ xfs_iunlock(
                 * it is in the AIL and anyone is waiting on it.  Don't do
                 * this if the caller has asked us not to.
                 */
-               xfs_trans_unlocked_item(ip->i_mount,
+               xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
                                        (xfs_log_item_t*)(ip->i_itemp));
        }
        xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
index d36450003983fe0e19aa7e6aaac6d6660faca1e6..f9ce62890ea5b0da0782d9eddec61d3217749a60 100644 (file)
@@ -30,11 +30,9 @@ typedef struct xfs_imap {
        ushort          im_boffset;     /* inode offset in block in bytes */
 } xfs_imap_t;
 
-#ifdef __KERNEL__
 struct xfs_mount;
 struct xfs_trans;
 int    xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
                 xfs_imap_t *, uint);
-#endif
 
 #endif /* __XFS_IMAP_H__ */
index dbd9cef852ece3d6d8b3b5b930626000c88bed06..cd522827f99e3c7f9fb6185bd3034170f659c13c 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_inode_item.h"
 #include "xfs_btree.h"
+#include "xfs_btree_trace.h"
 #include "xfs_alloc.h"
 #include "xfs_ialloc.h"
 #include "xfs_bmap.h"
@@ -221,25 +222,26 @@ xfs_imap_to_bp(
  * Use xfs_imap() to determine the size and location of the
  * buffer to read from disk.
  */
-STATIC int
+int
 xfs_inotobp(
        xfs_mount_t     *mp,
        xfs_trans_t     *tp,
        xfs_ino_t       ino,
        xfs_dinode_t    **dipp,
        xfs_buf_t       **bpp,
-       int             *offset)
+       int             *offset,
+       uint            imap_flags)
 {
        xfs_imap_t      imap;
        xfs_buf_t       *bp;
        int             error;
 
        imap.im_blkno = 0;
-       error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+       error = xfs_imap(mp, tp, ino, &imap, imap_flags | XFS_IMAP_LOOKUP);
        if (error)
                return error;
 
-       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+       error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
        if (error)
                return error;
 
@@ -621,7 +623,7 @@ xfs_iformat_btree(
        ifp = XFS_IFORK_PTR(ip, whichfork);
        dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
        size = XFS_BMAP_BROOT_SPACE(dfp);
-       nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
+       nrecs = be16_to_cpu(dfp->bb_numrecs);
 
        /*
         * blow out if -- fork has less extents than can fit in
@@ -649,8 +651,9 @@ xfs_iformat_btree(
         * Copy and convert from the on-disk structure
         * to the in-memory structure.
         */
-       xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
-               ifp->if_broot, size);
+       xfs_bmdr_to_bmbt(ip->i_mount, dfp,
+                        XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+                        ifp->if_broot, size);
        ifp->if_flags &= ~XFS_IFEXTENTS;
        ifp->if_flags |= XFS_IFBROOT;
 
@@ -788,51 +791,56 @@ xfs_dic2xflags(
 }
 
 /*
- * Given a mount structure and an inode number, return a pointer
- * to a newly allocated in-core inode corresponding to the given
- * inode number.
- *
- * Initialize the inode's attributes and extent pointers if it
- * already has them (it will not if the inode has no links).
+ * Allocate and initialise an xfs_inode.
  */
-int
-xfs_iread(
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_ino_t       ino,
-       xfs_inode_t     **ipp,
-       xfs_daddr_t     bno,
-       uint            imap_flags)
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino)
 {
-       xfs_buf_t       *bp;
-       xfs_dinode_t    *dip;
-       xfs_inode_t     *ip;
-       int             error;
+       struct xfs_inode        *ip;
 
-       ASSERT(xfs_inode_zone != NULL);
+       /*
+        * if this didn't occur in transactions, we could use
+        * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+        * code up to do this anyway.
+        */
+       ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+       if (!ip)
+               return NULL;
 
-       ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
-       ip->i_ino = ino;
-       ip->i_mount = mp;
-       atomic_set(&ip->i_iocount, 0);
-       spin_lock_init(&ip->i_flags_lock);
+       ASSERT(atomic_read(&ip->i_iocount) == 0);
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!spin_is_locked(&ip->i_flags_lock));
+       ASSERT(completion_done(&ip->i_flush));
 
        /*
-        * Get pointer's to the on-disk inode and the buffer containing it.
-        * If the inode number refers to a block outside the file system
-        * then xfs_itobp() will return NULL.  In this case we should
-        * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
-        * know that this is a new incore inode.
+        * initialise the VFS inode here to get failures
+        * out of the way early.
         */
-       error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
-       if (error) {
+       if (!inode_init_always(mp->m_super, VFS_I(ip))) {
                kmem_zone_free(xfs_inode_zone, ip);
-               return error;
+               return NULL;
        }
 
+       /* initialise the xfs inode */
+       ip->i_ino = ino;
+       ip->i_mount = mp;
+       ip->i_blkno = 0;
+       ip->i_len = 0;
+       ip->i_boffset =0;
+       ip->i_afp = NULL;
+       memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+       ip->i_flags = 0;
+       ip->i_update_core = 0;
+       ip->i_update_size = 0;
+       ip->i_delayed_blks = 0;
+       memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+       ip->i_size = 0;
+       ip->i_new_size = 0;
+
        /*
         * Initialize inode's trace buffers.
-        * Do this before xfs_iformat in case it adds entries.
         */
 #ifdef XFS_INODE_TRACE
        ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
@@ -840,7 +848,7 @@ xfs_iread(
 #ifdef XFS_BMAP_TRACE
        ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
 #endif
 #ifdef XFS_RW_TRACE
@@ -853,13 +861,51 @@ xfs_iread(
        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
 
+       return ip;
+}
+
+/*
+ * Given a mount structure and an inode number, return a pointer
+ * to a newly allocated in-core inode corresponding to the given
+ * inode number.
+ *
+ * Initialize the inode's attributes and extent pointers if it
+ * already has them (it will not if the inode has no links).
+ */
+int
+xfs_iread(
+       xfs_mount_t     *mp,
+       xfs_trans_t     *tp,
+       xfs_ino_t       ino,
+       xfs_inode_t     **ipp,
+       xfs_daddr_t     bno,
+       uint            imap_flags)
+{
+       xfs_buf_t       *bp;
+       xfs_dinode_t    *dip;
+       xfs_inode_t     *ip;
+       int             error;
+
+       ip = xfs_inode_alloc(mp, ino);
+       if (!ip)
+               return ENOMEM;
+
+       /*
+        * Get pointer's to the on-disk inode and the buffer containing it.
+        * If the inode number refers to a block outside the file system
+        * then xfs_itobp() will return NULL.  In this case we should
+        * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
+        * know that this is a new incore inode.
+        */
+       error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
+       if (error)
+               goto out_destroy_inode;
+
        /*
         * If we got something that isn't an inode it means someone
         * (nfs or dmi) has a stale handle.
         */
        if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) {
-               kmem_zone_free(xfs_inode_zone, ip);
-               xfs_trans_brelse(tp, bp);
 #ifdef DEBUG
                xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
                                "dip->di_core.di_magic (0x%x) != "
@@ -867,7 +913,8 @@ xfs_iread(
                                be16_to_cpu(dip->di_core.di_magic),
                                XFS_DINODE_MAGIC);
 #endif /* DEBUG */
-               return XFS_ERROR(EINVAL);
+               error = XFS_ERROR(EINVAL);
+               goto out_brelse;
        }
 
        /*
@@ -881,14 +928,12 @@ xfs_iread(
                xfs_dinode_from_disk(&ip->i_d, &dip->di_core);
                error = xfs_iformat(ip, dip);
                if (error)  {
-                       kmem_zone_free(xfs_inode_zone, ip);
-                       xfs_trans_brelse(tp, bp);
 #ifdef DEBUG
                        xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
                                        "xfs_iformat() returned error %d",
                                        error);
 #endif /* DEBUG */
-                       return error;
+                       goto out_brelse;
                }
        } else {
                ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic);
@@ -911,8 +956,6 @@ xfs_iread(
                        XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
        }
 
-       INIT_LIST_HEAD(&ip->i_reclaim);
-
        /*
         * The inode format changed when we moved the link count and
         * made it 32 bits long.  If this is an old format inode,
@@ -956,6 +999,12 @@ xfs_iread(
        xfs_trans_brelse(tp, bp);
        *ipp = ip;
        return 0;
+
+ out_brelse:
+       xfs_trans_brelse(tp, bp);
+ out_destroy_inode:
+       xfs_destroy_inode(ip);
+       return error;
 }
 
 /*
@@ -1049,6 +1098,7 @@ xfs_ialloc(
        uint            flags;
        int             error;
        timespec_t      tv;
+       int             filestreams = 0;
 
        /*
         * Call the space management code to pick
@@ -1056,9 +1106,8 @@ xfs_ialloc(
         */
        error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
                            ialloc_context, call_again, &ino);
-       if (error != 0) {
+       if (error)
                return error;
-       }
        if (*call_again || ino == NULLFSINO) {
                *ipp = NULL;
                return 0;
@@ -1072,9 +1121,8 @@ xfs_ialloc(
         */
        error = xfs_trans_iget(tp->t_mountp, tp, ino,
                                XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
-       if (error != 0) {
+       if (error)
                return error;
-       }
        ASSERT(ip != NULL);
 
        ip->i_d.di_mode = (__uint16_t)mode;
@@ -1155,13 +1203,12 @@ xfs_ialloc(
                flags |= XFS_ILOG_DEV;
                break;
        case S_IFREG:
-               if (pip && xfs_inode_is_filestream(pip)) {
-                       error = xfs_filestream_associate(pip, ip);
-                       if (error < 0)
-                               return -error;
-                       if (!error)
-                               xfs_iflags_set(ip, XFS_IFILESTREAM);
-               }
+               /*
+                * we can't set up filestreams until after the VFS inode
+                * is set up properly.
+                */
+               if (pip && xfs_inode_is_filestream(pip))
+                       filestreams = 1;
                /* fall through */
        case S_IFDIR:
                if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
@@ -1227,6 +1274,15 @@ xfs_ialloc(
        /* now that we have an i_mode we can setup inode ops and unlock */
        xfs_setup_inode(ip);
 
+       /* now we have set up the vfs inode we can associate the filestream */
+       if (filestreams) {
+               error = xfs_filestream_associate(pip, ip);
+               if (error < 0)
+                       return -error;
+               if (!error)
+                       xfs_iflags_set(ip, XFS_IFILESTREAM);
+       }
+
        *ipp = ip;
        return 0;
 }
@@ -1414,7 +1470,7 @@ xfs_itruncate_start(
        mp = ip->i_mount;
 
        /* wait for the completion of any pending DIOs */
-       if (new_size < ip->i_size)
+       if (new_size == 0 || new_size < ip->i_size)
                vn_iowait(ip);
 
        /*
@@ -1992,7 +2048,7 @@ xfs_iunlink_remove(
                        }
                        next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
                        error = xfs_inotobp(mp, tp, next_ino, &last_dip,
-                                           &last_ibp, &last_offset);
+                                           &last_ibp, &last_offset, 0);
                        if (error) {
                                cmn_err(CE_WARN,
                        "xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.  Returning error.",
@@ -2160,9 +2216,9 @@ xfs_ifree_cluster(
                                iip = (xfs_inode_log_item_t *)lip;
                                ASSERT(iip->ili_logged == 1);
                                lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
-                               spin_lock(&mp->m_ail_lock);
-                               iip->ili_flush_lsn = iip->ili_item.li_lsn;
-                               spin_unlock(&mp->m_ail_lock);
+                               xfs_trans_ail_copy_lsn(mp->m_ail,
+                                                       &iip->ili_flush_lsn,
+                                                       &iip->ili_item.li_lsn);
                                xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
                                pre_flushed++;
                        }
@@ -2183,9 +2239,8 @@ xfs_ifree_cluster(
                        iip->ili_last_fields = iip->ili_format.ilf_fields;
                        iip->ili_format.ilf_fields = 0;
                        iip->ili_logged = 1;
-                       spin_lock(&mp->m_ail_lock);
-                       iip->ili_flush_lsn = iip->ili_item.li_lsn;
-                       spin_unlock(&mp->m_ail_lock);
+                       xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+                                               &iip->ili_item.li_lsn);
 
                        xfs_buf_attach_iodone(bp,
                                (void(*)(xfs_buf_t*,xfs_log_item_t*))
@@ -2312,9 +2367,10 @@ xfs_iroot_realloc(
        int                     rec_diff,
        int                     whichfork)
 {
+       struct xfs_mount        *mp = ip->i_mount;
        int                     cur_max;
        xfs_ifork_t             *ifp;
-       xfs_bmbt_block_t        *new_broot;
+       struct xfs_btree_block  *new_broot;
        int                     new_max;
        size_t                  new_size;
        char                    *np;
@@ -2335,8 +2391,7 @@ xfs_iroot_realloc(
                 */
                if (ifp->if_broot_bytes == 0) {
                        new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
-                       ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
-                                                                    KM_SLEEP);
+                       ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
                        ifp->if_broot_bytes = (int)new_size;
                        return;
                }
@@ -2347,18 +2402,16 @@ xfs_iroot_realloc(
                 * location.  The records don't change location because
                 * they are kept butted up against the btree block header.
                 */
-               cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+               cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
                new_max = cur_max + rec_diff;
                new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
-               ifp->if_broot = (xfs_bmbt_block_t *)
-                 kmem_realloc(ifp->if_broot,
-                               new_size,
+               ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
                                (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
                                KM_SLEEP);
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
-                                                     ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
-                                                     (int)new_size);
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    ifp->if_broot_bytes);
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+                                                    (int)new_size);
                ifp->if_broot_bytes = (int)new_size;
                ASSERT(ifp->if_broot_bytes <=
                        XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
@@ -2372,7 +2425,7 @@ xfs_iroot_realloc(
         * records, just get rid of the root and clear the status bit.
         */
        ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
-       cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+       cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
        new_max = cur_max + rec_diff;
        ASSERT(new_max >= 0);
        if (new_max > 0)
@@ -2380,11 +2433,11 @@ xfs_iroot_realloc(
        else
                new_size = 0;
        if (new_size > 0) {
-               new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
+               new_broot = kmem_alloc(new_size, KM_SLEEP);
                /*
                 * First copy over the btree block header.
                 */
-               memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
+               memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
        } else {
                new_broot = NULL;
                ifp->if_flags &= ~XFS_IFBROOT;
@@ -2397,18 +2450,16 @@ xfs_iroot_realloc(
                /*
                 * First copy the records.
                 */
-               op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
-                                                    ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
-                                                    (int)new_size);
+               op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+               np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
                memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
 
                /*
                 * Then copy the pointers.
                 */
-               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+               op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
                                                     ifp->if_broot_bytes);
-               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
+               np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
                                                     (int)new_size);
                memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
        }
@@ -2617,6 +2668,10 @@ xfs_idestroy_fork(
  * It must free the inode itself and any buffers allocated for
  * if_extents/if_data and if_broot.  It must also free the lock
  * associated with the inode.
+ *
+ * Note: because we don't initialise everything on reallocation out
+ * of the zone, we must ensure we nullify everything correctly before
+ * freeing the structure.
  */
 void
 xfs_idestroy(
@@ -2631,8 +2686,6 @@ xfs_idestroy(
        }
        if (ip->i_afp)
                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-       mrfree(&ip->i_lock);
-       mrfree(&ip->i_iolock);
 
 #ifdef XFS_INODE_TRACE
        ktrace_free(ip->i_trace);
@@ -2640,7 +2693,7 @@ xfs_idestroy(
 #ifdef XFS_BMAP_TRACE
        ktrace_free(ip->i_xtrace);
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        ktrace_free(ip->i_btrace);
 #endif
 #ifdef XFS_RW_TRACE
@@ -2658,20 +2711,26 @@ xfs_idestroy(
                 * inode still in the AIL. If it is there, we should remove
                 * it to prevent a use-after-free from occurring.
                 */
-               xfs_mount_t     *mp = ip->i_mount;
                xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
+               struct xfs_ail  *ailp = lip->li_ailp;
 
                ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
                                       XFS_FORCED_SHUTDOWN(ip->i_mount));
                if (lip->li_flags & XFS_LI_IN_AIL) {
-                       spin_lock(&mp->m_ail_lock);
+                       spin_lock(&ailp->xa_lock);
                        if (lip->li_flags & XFS_LI_IN_AIL)
-                               xfs_trans_delete_ail(mp, lip);
+                               xfs_trans_ail_delete(ailp, lip);
                        else
-                               spin_unlock(&mp->m_ail_lock);
+                               spin_unlock(&ailp->xa_lock);
                }
                xfs_inode_item_destroy(ip);
+               ip->i_itemp = NULL;
        }
+       /* asserts to verify all state is correct here */
+       ASSERT(atomic_read(&ip->i_iocount) == 0);
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!spin_is_locked(&ip->i_flags_lock));
+       ASSERT(completion_done(&ip->i_flush));
        kmem_zone_free(xfs_inode_zone, ip);
 }
 
@@ -2880,7 +2939,7 @@ xfs_iflush_fork(
                        ASSERT(ifp->if_broot_bytes <=
                               (XFS_IFORK_SIZE(ip, whichfork) +
                                XFS_BROOT_SIZE_ADJ));
-                       xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
+                       xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
                                (xfs_bmdr_block_t *)cp,
                                XFS_DFORK_SIZE(dip, mp, whichfork));
                }
@@ -3418,10 +3477,8 @@ xfs_iflush_int(
                iip->ili_format.ilf_fields = 0;
                iip->ili_logged = 1;
 
-               ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
-               spin_lock(&mp->m_ail_lock);
-               iip->ili_flush_lsn = iip->ili_item.li_lsn;
-               spin_unlock(&mp->m_ail_lock);
+               xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+                                       &iip->ili_item.li_lsn);
 
                /*
                 * Attach the function xfs_iflush_done to the inode's
@@ -3459,41 +3516,6 @@ corrupt_out:
 }
 
 
-/*
- * Flush all inactive inodes in mp.
- */
-void
-xfs_iflush_all(
-       xfs_mount_t     *mp)
-{
-       xfs_inode_t     *ip;
-
- again:
-       XFS_MOUNT_ILOCK(mp);
-       ip = mp->m_inodes;
-       if (ip == NULL)
-               goto out;
-
-       do {
-               /* Make sure we skip markers inserted by sync */
-               if (ip->i_mount == NULL) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
-
-               if (!VFS_I(ip)) {
-                       XFS_MOUNT_IUNLOCK(mp);
-                       xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
-                       goto again;
-               }
-
-               ASSERT(vn_count(VFS_I(ip)) == 0);
-
-               ip = ip->i_mnext;
-       } while (ip != mp->m_inodes);
- out:
-       XFS_MOUNT_IUNLOCK(mp);
-}
 
 #ifdef XFS_ILOCK_TRACE
 ktrace_t       *xfs_ilock_trace_buf;
index 1420c49674d7451f196fabbd1df76b731dfc7703..7f007ef4bbb3883a90daa6f0d3469786b5fd1862 100644 (file)
@@ -20,7 +20,7 @@
 
 struct xfs_dinode;
 struct xfs_dinode_core;
-
+struct xfs_inode;
 
 /*
  * Fork identifiers.
@@ -63,7 +63,7 @@ typedef struct xfs_ext_irec {
 typedef struct xfs_ifork {
        int                     if_bytes;       /* bytes in if_u1 */
        int                     if_real_bytes;  /* bytes allocated in if_u1 */
-       xfs_bmbt_block_t        *if_broot;      /* file's incore btree root */
+       struct xfs_btree_block  *if_broot;      /* file's incore btree root */
        short                   if_broot_bytes; /* bytes allocated for root */
        unsigned char           if_flags;       /* per-fork flags */
        unsigned char           if_ext_max;     /* max # of extent records */
@@ -83,54 +83,6 @@ typedef struct xfs_ifork {
        } if_u2;
 } xfs_ifork_t;
 
-/*
- * Flags for xfs_ichgtime().
- */
-#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
-#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
-
-/*
- * Per-fork incore inode flags.
- */
-#define        XFS_IFINLINE    0x01    /* Inline data is read in */
-#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
-#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
-#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
-
-/*
- * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate().
- */
-#define XFS_IMAP_LOOKUP                0x1
-#define XFS_IMAP_BULKSTAT      0x2
-
-#ifdef __KERNEL__
-struct bhv_desc;
-struct cred;
-struct ktrace;
-struct xfs_buf;
-struct xfs_bmap_free;
-struct xfs_bmbt_irec;
-struct xfs_bmbt_block;
-struct xfs_inode;
-struct xfs_inode_log_item;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_dquot;
-
-#if defined(XFS_ILOCK_TRACE)
-#define XFS_ILOCK_KTRACE_SIZE  32
-extern ktrace_t *xfs_ilock_trace_buf;
-extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
-#else
-#define        xfs_ilock_trace(i,n,f,ra)
-#endif
-
-typedef struct dm_attrs_s {
-       __uint32_t      da_dmevmask;    /* DMIG event mask */
-       __uint16_t      da_dmstate;     /* DMIG state info */
-       __uint16_t      da_pad;         /* DMIG extra padding */
-} dm_attrs_t;
-
 /*
  * This is the xfs in-core inode structure.
  * Most of the on-disk inode is embedded in the i_d field.
@@ -191,19 +143,98 @@ typedef struct xfs_icdinode {
        __uint32_t      di_gen;         /* generation number */
 } xfs_icdinode_t;
 
-typedef struct {
-       struct xfs_inode        *ip_mnext;      /* next inode in mount list */
-       struct xfs_inode        *ip_mprev;      /* ptr to prev inode */
-       struct xfs_mount        *ip_mount;      /* fs mount struct ptr */
-} xfs_iptr_t;
+/*
+ * Flags for xfs_ichgtime().
+ */
+#define        XFS_ICHGTIME_MOD        0x1     /* data fork modification timestamp */
+#define        XFS_ICHGTIME_CHG        0x2     /* inode field change timestamp */
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define        XFS_IFINLINE    0x01    /* Inline data is read in */
+#define        XFS_IFEXTENTS   0x02    /* All extent pointers are read in */
+#define        XFS_IFBROOT     0x04    /* i_broot points to the bmap b-tree root */
+#define        XFS_IFEXTIREC   0x08    /* Indirection array of extent blocks */
+
+/*
+ * Flags for xfs_inotobp, xfs_itobp(), xfs_imap() and xfs_dilocate().
+ */
+#define XFS_IMAP_LOOKUP                0x1
+#define XFS_IMAP_BULKSTAT      0x2
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w)            \
+       ((w) == XFS_DATA_FORK ? \
+               &(ip)->i_df : \
+               (ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_IFORK_BOFF(ip) : \
+               XFS_LITINO((ip)->i_mount))
+#define XFS_IFORK_ASIZE(ip) \
+       (XFS_IFORK_Q(ip) ? \
+               XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
+               0)
+#define XFS_IFORK_SIZE(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               XFS_IFORK_DSIZE(ip) : \
+               XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_format : \
+               (ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_format = (n)) : \
+               ((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+       ((w) == XFS_DATA_FORK ? \
+               (ip)->i_d.di_nextents : \
+               (ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+       ((w) == XFS_DATA_FORK ? \
+               ((ip)->i_d.di_nextents = (n)) : \
+               ((ip)->i_d.di_anextents = (n)))
+
+
+
+#ifdef __KERNEL__
+
+struct bhv_desc;
+struct cred;
+struct ktrace;
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_bmbt_irec;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+
+#if defined(XFS_ILOCK_TRACE)
+#define XFS_ILOCK_KTRACE_SIZE  32
+extern ktrace_t *xfs_ilock_trace_buf;
+extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
+#else
+#define        xfs_ilock_trace(i,n,f,ra)
+#endif
+
+typedef struct dm_attrs_s {
+       __uint32_t      da_dmevmask;    /* DMIG event mask */
+       __uint16_t      da_dmstate;     /* DMIG state info */
+       __uint16_t      da_pad;         /* DMIG extra padding */
+} dm_attrs_t;
 
 typedef struct xfs_inode {
        /* Inode linking and identification information. */
-       struct xfs_inode        *i_mnext;       /* next inode in mount list */
-       struct xfs_inode        *i_mprev;       /* ptr to prev inode */
        struct xfs_mount        *i_mount;       /* fs mount struct ptr */
-       struct list_head        i_reclaim;      /* reclaim list */
-       struct inode            *i_vnode;       /* vnode backpointer */
        struct xfs_dquot        *i_udquot;      /* user dquot */
        struct xfs_dquot        *i_gdquot;      /* group dquot */
 
@@ -238,6 +269,10 @@ typedef struct xfs_inode {
        xfs_fsize_t             i_size;         /* in-memory size */
        xfs_fsize_t             i_new_size;     /* size when write completes */
        atomic_t                i_iocount;      /* outstanding I/O count */
+
+       /* VFS inode */
+       struct inode            i_vnode;        /* embedded VFS inode */
+
        /* Trace buffers per inode. */
 #ifdef XFS_INODE_TRACE
        struct ktrace           *i_trace;       /* general inode trace */
@@ -245,7 +280,7 @@ typedef struct xfs_inode {
 #ifdef XFS_BMAP_TRACE
        struct ktrace           *i_xtrace;      /* inode extent list trace */
 #endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
        struct ktrace           *i_btrace;      /* inode bmap btree trace */
 #endif
 #ifdef XFS_RW_TRACE
@@ -265,13 +300,30 @@ typedef struct xfs_inode {
 /* Convert from vfs inode to xfs inode */
 static inline struct xfs_inode *XFS_I(struct inode *inode)
 {
-       return (struct xfs_inode *)inode->i_private;
+       return container_of(inode, struct xfs_inode, i_vnode);
 }
 
 /* convert from xfs inode to vfs inode */
 static inline struct inode *VFS_I(struct xfs_inode *ip)
 {
-       return (struct inode *)ip->i_vnode;
+       return &ip->i_vnode;
+}
+
+/*
+ * Get rid of a partially initialized inode.
+ *
+ * We have to go through destroy_inode to make sure allocations
+ * from init_inode_always like the security data are undone.
+ *
+ * We mark the inode bad so that it takes the short cut in
+ * the reclaim path instead of going through the flush path
+ * which doesn't make sense for an inode that has never seen the
+ * light of day.
+ */
+static inline void xfs_destroy_inode(struct xfs_inode *ip)
+{
+       make_bad_inode(VFS_I(ip));
+       return destroy_inode(VFS_I(ip));
 }
 
 /*
@@ -327,50 +379,26 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
        spin_unlock(&ip->i_flags_lock);
        return ret;
 }
-#endif /* __KERNEL__ */
-
 
 /*
- * Fork handling.
+ * Manage the i_flush queue embedded in the inode.  This completion
+ * queue synchronizes processes attempting to flush the in-core
+ * inode back to disk.
  */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+       wait_for_completion(&ip->i_flush);
+}
 
-#define XFS_IFORK_Q(ip)                        ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip)             ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w)            \
-       ((w) == XFS_DATA_FORK ? \
-               &(ip)->i_df : \
-               (ip)->i_afp)
-#define XFS_IFORK_DSIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_IFORK_BOFF(ip) : \
-               XFS_LITINO((ip)->i_mount))
-#define XFS_IFORK_ASIZE(ip) \
-       (XFS_IFORK_Q(ip) ? \
-               XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
-               0)
-#define XFS_IFORK_SIZE(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               XFS_IFORK_DSIZE(ip) : \
-               XFS_IFORK_ASIZE(ip))
-#define XFS_IFORK_FORMAT(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_format : \
-               (ip)->i_d.di_aformat)
-#define XFS_IFORK_FMT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_format = (n)) : \
-               ((ip)->i_d.di_aformat = (n)))
-#define XFS_IFORK_NEXTENTS(ip,w) \
-       ((w) == XFS_DATA_FORK ? \
-               (ip)->i_d.di_nextents : \
-               (ip)->i_d.di_anextents)
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
-       ((w) == XFS_DATA_FORK ? \
-               ((ip)->i_d.di_nextents = (n)) : \
-               ((ip)->i_d.di_anextents = (n)))
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+       return try_wait_for_completion(&ip->i_flush);
+}
 
-#ifdef __KERNEL__
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+       complete(&ip->i_flush);
+}
 
 /*
  * In-core inode flags.
@@ -484,25 +512,15 @@ int               xfs_isilocked(xfs_inode_t *, uint);
 uint           xfs_ilock_map_shared(xfs_inode_t *);
 void           xfs_iunlock_map_shared(xfs_inode_t *, uint);
 void           xfs_ireclaim(xfs_inode_t *);
-int            xfs_finish_reclaim(xfs_inode_t *, int, int);
-int            xfs_finish_reclaim_all(struct xfs_mount *, int);
 
 /*
  * xfs_inode.c prototypes.
  */
-int            xfs_itobp(struct xfs_mount *, struct xfs_trans *,
-                         xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
-                         xfs_daddr_t, uint, uint);
 int            xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
                          xfs_inode_t **, xfs_daddr_t, uint);
-int            xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
 int            xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
                           xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t,
                           int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
-void           xfs_dinode_from_disk(struct xfs_icdinode *,
-                                    struct xfs_dinode_core *);
-void           xfs_dinode_to_disk(struct xfs_dinode_core *,
-                                  struct xfs_icdinode *);
 
 uint           xfs_ip2xflags(struct xfs_inode *);
 uint           xfs_dic2xflags(struct xfs_dinode *);
@@ -513,17 +531,12 @@ int               xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
                                     xfs_fsize_t, int, int);
 int            xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 
-void           xfs_idestroy_fork(xfs_inode_t *, int);
 void           xfs_idestroy(xfs_inode_t *);
-void           xfs_idata_realloc(xfs_inode_t *, int, int);
 void           xfs_iextract(xfs_inode_t *);
 void           xfs_iext_realloc(xfs_inode_t *, int, int);
-void           xfs_iroot_realloc(xfs_inode_t *, int, int);
 void           xfs_ipin(xfs_inode_t *);
 void           xfs_iunpin(xfs_inode_t *);
-int            xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
 int            xfs_iflush(xfs_inode_t *, uint);
-void           xfs_iflush_all(struct xfs_mount *);
 void           xfs_ichgtime(xfs_inode_t *, int);
 xfs_fsize_t    xfs_file_last_byte(xfs_inode_t *);
 void           xfs_lock_inodes(xfs_inode_t **, int, uint);
@@ -532,6 +545,24 @@ void               xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 void           xfs_synchronize_atime(xfs_inode_t *);
 void           xfs_mark_inode_dirty_sync(xfs_inode_t *);
 
+#endif /* __KERNEL__ */
+
+int            xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
+                           xfs_ino_t, struct xfs_dinode **,
+                           struct xfs_buf **, int *, uint);
+int            xfs_itobp(struct xfs_mount *, struct xfs_trans *,
+                         struct xfs_inode *, struct xfs_dinode **,
+                         struct xfs_buf **, xfs_daddr_t, uint, uint);
+void           xfs_dinode_from_disk(struct xfs_icdinode *,
+                                    struct xfs_dinode_core *);
+void           xfs_dinode_to_disk(struct xfs_dinode_core *,
+                                  struct xfs_icdinode *);
+void           xfs_idestroy_fork(struct xfs_inode *, int);
+void           xfs_idata_realloc(struct xfs_inode *, int, int);
+void           xfs_iroot_realloc(struct xfs_inode *, int, int);
+int            xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int            xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
+
 xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
 void           xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
                                xfs_bmbt_irec_t *);
@@ -561,7 +592,8 @@ void                xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
 #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
 
 #ifdef DEBUG
-void           xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t);
+void           xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
+                               xfs_fsize_t);
 #else  /* DEBUG */
 #define xfs_isize_check(mp, ip, isize)
 #endif /* DEBUG */
@@ -576,26 +608,4 @@ extern struct kmem_zone    *xfs_ifork_zone;
 extern struct kmem_zone        *xfs_inode_zone;
 extern struct kmem_zone        *xfs_ili_zone;
 
-/*
- * Manage the i_flush queue embedded in the inode.  This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
-       wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
-       return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
-       complete(&ip->i_flush);
-}
-
-#endif /* __KERNEL__ */
-
 #endif /* __XFS_INODE_H__ */
index 97c7452e2620ee56040c03a6b8bb09fdfb4cc854..aa9bf05060c6a4126923761bab416bc834d4cf5f 100644 (file)
@@ -932,6 +932,7 @@ xfs_inode_item_init(
        iip->ili_item.li_type = XFS_LI_INODE;
        iip->ili_item.li_ops = &xfs_inode_item_ops;
        iip->ili_item.li_mountp = mp;
+       iip->ili_item.li_ailp = mp->m_ail;
        iip->ili_inode = ip;
 
        /*
@@ -976,9 +977,8 @@ xfs_iflush_done(
        xfs_buf_t               *bp,
        xfs_inode_log_item_t    *iip)
 {
-       xfs_inode_t     *ip;
-
-       ip = iip->ili_inode;
+       xfs_inode_t             *ip = iip->ili_inode;
+       struct xfs_ail          *ailp = iip->ili_item.li_ailp;
 
        /*
         * We only want to pull the item from the AIL if it is
@@ -991,15 +991,12 @@ xfs_iflush_done(
         */
        if (iip->ili_logged &&
            (iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
-               spin_lock(&ip->i_mount->m_ail_lock);
+               spin_lock(&ailp->xa_lock);
                if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
-                       /*
-                        * xfs_trans_delete_ail() drops the AIL lock.
-                        */
-                       xfs_trans_delete_ail(ip->i_mount,
-                                            (xfs_log_item_t*)iip);
+                       /* xfs_trans_ail_delete() drops the AIL lock. */
+                       xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
                } else {
-                       spin_unlock(&ip->i_mount->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
                }
        }
 
@@ -1031,21 +1028,20 @@ void
 xfs_iflush_abort(
        xfs_inode_t             *ip)
 {
-       xfs_inode_log_item_t    *iip;
+       xfs_inode_log_item_t    *iip = ip->i_itemp;
        xfs_mount_t             *mp;
 
        iip = ip->i_itemp;
        mp = ip->i_mount;
        if (iip) {
+               struct xfs_ail  *ailp = iip->ili_item.li_ailp;
                if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
-                       spin_lock(&mp->m_ail_lock);
+                       spin_lock(&ailp->xa_lock);
                        if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
-                               /*
-                                * xfs_trans_delete_ail() drops the AIL lock.
-                                */
-                               xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip);
+                               /* xfs_trans_ail_delete() drops the AIL lock. */
+                               xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
                        } else
-                               spin_unlock(&mp->m_ail_lock);
+                               spin_unlock(&ailp->xa_lock);
                }
                iip->ili_logged = 0;
                /*
index 40513077ab36f71178d50a0df2a0505e56df0853..1ff04cc323ad98c5b5faddd5e760ac4c212aba2e 100644 (file)
@@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 {
 #define        XFS_ILI_IOLOCKED_ANY   (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
 
 
+#define        XFS_ILOG_FBROOT(w)      xfs_ilog_fbroot(w)
+static inline int xfs_ilog_fbroot(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+#define        XFS_ILOG_FEXT(w)        xfs_ilog_fext(w)
+static inline int xfs_ilog_fext(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+#define        XFS_ILOG_FDATA(w)       xfs_ilog_fdata(w)
+static inline int xfs_ilog_fdata(int w)
+{
+       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
 #ifdef __KERNEL__
 
 struct xfs_buf;
@@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item {
 } xfs_inode_log_item_t;
 
 
-#define        XFS_ILOG_FDATA(w)       xfs_ilog_fdata(w)
-static inline int xfs_ilog_fdata(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-#endif /* __KERNEL__ */
-
-#define        XFS_ILOG_FBROOT(w)      xfs_ilog_fbroot(w)
-static inline int xfs_ilog_fbroot(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-#define        XFS_ILOG_FEXT(w)        xfs_ilog_fext(w)
-static inline int xfs_ilog_fext(int w)
-{
-       return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
 static inline int xfs_inode_clean(xfs_inode_t *ip)
 {
        return (!ip->i_itemp ||
@@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
               !ip->i_update_core;
 }
 
-
-#ifdef __KERNEL__
-
 extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
 extern void xfs_inode_item_destroy(struct xfs_inode *);
 extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
index cf6754a3c5b3e07a1e660c11ead329de21ef314c..35118032a5d654eac40e1415126ae1aae2cc02aa 100644 (file)
@@ -359,7 +359,6 @@ xfs_bulkstat(
        int                     ubused; /* bytes used by formatter */
        xfs_buf_t               *bp;    /* ptr to on-disk inode cluster buf */
        xfs_dinode_t            *dip;   /* ptr into bp for specific inode */
-       xfs_inode_t             *ip;    /* ptr to in-core inode struct */
 
        /*
         * Get the last inode value, see if there's nothing to do.
@@ -416,8 +415,7 @@ xfs_bulkstat(
                /*
                 * Allocate and initialize a btree cursor for ialloc btree.
                 */
-               cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO,
-                                               (xfs_inode_t *)0, 0);
+               cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
                irbp = irbuf;
                irbufend = irbuf + nirbuf;
                end_of_ag = 0;
@@ -472,7 +470,7 @@ xfs_bulkstat(
                         * In any case, increment to the next record.
                         */
                        if (!error)
-                               error = xfs_inobt_increment(cur, 0, &tmp);
+                               error = xfs_btree_increment(cur, 0, &tmp);
                } else {
                        /*
                         * Start of ag.  Lookup the first inode chunk.
@@ -539,7 +537,7 @@ xfs_bulkstat(
                         * Set agino to after this chunk and bump the cursor.
                         */
                        agino = gino + XFS_INODES_PER_CHUNK;
-                       error = xfs_inobt_increment(cur, 0, &tmp);
+                       error = xfs_btree_increment(cur, 0, &tmp);
                        cond_resched();
                }
                /*
@@ -586,6 +584,8 @@ xfs_bulkstat(
 
                                        if (flags & (BULKSTAT_FG_QUICK |
                                                     BULKSTAT_FG_INLINE)) {
+                                               int offset;
+
                                                ino = XFS_AGINO_TO_INO(mp, agno,
                                                                       agino);
                                                bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -594,21 +594,15 @@ xfs_bulkstat(
                                                /*
                                                 * Get the inode cluster buffer
                                                 */
-                                               ASSERT(xfs_inode_zone != NULL);
-                                               ip = kmem_zone_zalloc(xfs_inode_zone,
-                                                                     KM_SLEEP);
-                                               ip->i_ino = ino;
-                                               ip->i_mount = mp;
-                                               spin_lock_init(&ip->i_flags_lock);
                                                if (bp)
                                                        xfs_buf_relse(bp);
-                                               error = xfs_itobp(mp, NULL, ip,
-                                                               &dip, &bp, bno,
-                                                               XFS_IMAP_BULKSTAT,
-                                                               XFS_BUF_LOCK);
+
+                                               error = xfs_inotobp(mp, NULL, ino, &dip,
+                                                                   &bp, &offset,
+                                                                   XFS_IMAP_BULKSTAT);
+
                                                if (!error)
-                                                       clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
-                                               kmem_zone_free(xfs_inode_zone, ip);
+                                                       clustidx = offset / mp->m_sb.sb_inodesize;
                                                if (XFS_TEST_ERROR(error != 0,
                                                                   mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
                                                                   XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
@@ -842,8 +836,7 @@ xfs_inumbers(
                                agino = 0;
                                continue;
                        }
-                       cur = xfs_btree_init_cursor(mp, NULL, agbp, agno,
-                               XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+                       cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
                        error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
                        if (error) {
                                xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -887,7 +880,7 @@ xfs_inumbers(
                        bufidx = 0;
                }
                if (left) {
-                       error = xfs_inobt_increment(cur, 0, &tmp);
+                       error = xfs_btree_increment(cur, 0, &tmp);
                        if (error) {
                                xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
                                cur = NULL;
index 0b02c6443551f78b24af282cad994c5b07a2523f..51840170b16c43b243c7cfb8ecca2a6b4dc3dac2 100644 (file)
@@ -567,12 +567,12 @@ xfs_log_mount(
        /*
         * Initialize the AIL now we have a log.
         */
-       spin_lock_init(&mp->m_ail_lock);
        error = xfs_trans_ail_init(mp);
        if (error) {
                cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
                goto error;
        }
+       mp->m_log->l_ailp = mp->m_ail;
 
        /*
         * skip log recovery on a norecovery mount.  pretend it all
@@ -900,7 +900,7 @@ xfs_log_move_tail(xfs_mount_t       *mp,
 int
 xfs_log_need_covered(xfs_mount_t *mp)
 {
-       int             needed = 0, gen;
+       int             needed = 0;
        xlog_t          *log = mp->m_log;
 
        if (!xfs_fs_writable(mp))
@@ -909,7 +909,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
        spin_lock(&log->l_icloglock);
        if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
                (log->l_covered_state == XLOG_STATE_COVER_NEED2))
-                       && !xfs_trans_first_ail(mp, &gen)
+                       && !xfs_trans_ail_tail(log->l_ailp)
                        && xlog_iclogs_empty(log)) {
                if (log->l_covered_state == XLOG_STATE_COVER_NEED)
                        log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -946,7 +946,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
        xfs_lsn_t tail_lsn;
        xlog_t    *log = mp->m_log;
 
-       tail_lsn = xfs_trans_tail_ail(mp);
+       tail_lsn = xfs_trans_ail_tail(mp->m_ail);
        spin_lock(&log->l_grant_lock);
        if (tail_lsn != 0) {
                log->l_tail_lsn = tail_lsn;
@@ -1413,7 +1413,7 @@ xlog_grant_push_ail(xfs_mount_t   *mp,
      */
     if (threshold_lsn &&
        !XLOG_FORCED_SHUTDOWN(log))
-           xfs_trans_push_ail(mp, threshold_lsn);
+           xfs_trans_ail_push(log->l_ailp, threshold_lsn);
 }      /* xlog_grant_push_ail */
 
 
index e7d8f84443fab87ceee83561fb6b1f04685400ff..de7ef6ca9206d797ea7b9007b2e4a30e2178dffc 100644 (file)
@@ -404,6 +404,7 @@ typedef struct xlog_in_core {
 typedef struct log {
        /* The following fields don't need locking */
        struct xfs_mount        *l_mp;          /* mount point */
+       struct xfs_ail          *l_ailp;        /* AIL log is working with */
        struct xfs_buf          *l_xbuf;        /* extra buffer for log
                                                 * wrapping */
        struct xfs_buftarg      *l_targ;        /* buftarg of log */
index 82d46ce69d5f1e49b73f1842f92e06731af599fb..b411d4947318a97f6ab24a160d92beee1df9ff81 100644 (file)
@@ -54,10 +54,8 @@ STATIC void  xlog_recover_insert_item_backq(xlog_recover_item_t **q,
                                               xlog_recover_item_t *item);
 #if defined(DEBUG)
 STATIC void    xlog_recover_check_summary(xlog_t *);
-STATIC void    xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
 #else
 #define        xlog_recover_check_summary(log)
-#define        xlog_recover_check_ail(mp, lip, gen)
 #endif
 
 
@@ -1419,7 +1417,13 @@ xlog_recover_add_to_trans(
                return 0;
        item = trans->r_itemq;
        if (item == NULL) {
-               ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
+               /* we need to catch log corruptions here */
+               if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
+                       xlog_warn("XFS: xlog_recover_add_to_trans: "
+                                 "bad header magic number");
+                       ASSERT(0);
+                       return XFS_ERROR(EIO);
+               }
                if (len == sizeof(xfs_trans_header_t))
                        xlog_recover_add_item(&trans->r_itemq);
                memcpy(&trans->r_theader, dp, len); /* d, s, l */
@@ -2452,8 +2456,8 @@ xlog_recover_do_inode_trans(
                break;
 
        case XFS_ILOG_DBROOT:
-               xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
-                                &(dip->di_u.di_bmbt),
+               xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+                                &dip->di_u.di_bmbt,
                                 XFS_DFORK_DSIZE(dip, mp));
                break;
 
@@ -2490,8 +2494,8 @@ xlog_recover_do_inode_trans(
 
                case XFS_ILOG_ABROOT:
                        dest = XFS_DFORK_APTR(dip);
-                       xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
-                                        (xfs_bmdr_block_t*)dest,
+                       xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+                                        len, (xfs_bmdr_block_t*)dest,
                                         XFS_DFORK_ASIZE(dip, mp));
                        break;
 
@@ -2683,11 +2687,11 @@ xlog_recover_do_efi_trans(
        efip->efi_next_extent = efi_formatp->efi_nextents;
        efip->efi_flags |= XFS_EFI_COMMITTED;
 
-       spin_lock(&mp->m_ail_lock);
+       spin_lock(&log->l_ailp->xa_lock);
        /*
-        * xfs_trans_update_ail() drops the AIL lock.
+        * xfs_trans_ail_update() drops the AIL lock.
         */
-       xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn);
+       xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
        return 0;
 }
 
@@ -2706,12 +2710,12 @@ xlog_recover_do_efd_trans(
        xlog_recover_item_t     *item,
        int                     pass)
 {
-       xfs_mount_t             *mp;
        xfs_efd_log_format_t    *efd_formatp;
        xfs_efi_log_item_t      *efip = NULL;
        xfs_log_item_t          *lip;
-       int                     gen;
        __uint64_t              efi_id;
+       struct xfs_ail_cursor   cur;
+       struct xfs_ail          *ailp = log->l_ailp;
 
        if (pass == XLOG_RECOVER_PASS1) {
                return;
@@ -2728,25 +2732,26 @@ xlog_recover_do_efd_trans(
         * Search for the efi with the id in the efd format structure
         * in the AIL.
         */
-       mp = log->l_mp;
-       spin_lock(&mp->m_ail_lock);
-       lip = xfs_trans_first_ail(mp, &gen);
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
        while (lip != NULL) {
                if (lip->li_type == XFS_LI_EFI) {
                        efip = (xfs_efi_log_item_t *)lip;
                        if (efip->efi_format.efi_id == efi_id) {
                                /*
-                                * xfs_trans_delete_ail() drops the
+                                * xfs_trans_ail_delete() drops the
                                 * AIL lock.
                                 */
-                               xfs_trans_delete_ail(mp, lip);
+                               xfs_trans_ail_delete(ailp, lip);
                                xfs_efi_item_free(efip);
-                               return;
+                               spin_lock(&ailp->xa_lock);
+                               break;
                        }
                }
-               lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+               lip = xfs_trans_ail_cursor_next(ailp, &cur);
        }
-       spin_unlock(&mp->m_ail_lock);
+       xfs_trans_ail_cursor_done(ailp, &cur);
+       spin_unlock(&ailp->xa_lock);
 }
 
 /*
@@ -3029,33 +3034,6 @@ abort_error:
        return error;
 }
 
-/*
- * Verify that once we've encountered something other than an EFI
- * in the AIL that there are no more EFIs in the AIL.
- */
-#if defined(DEBUG)
-STATIC void
-xlog_recover_check_ail(
-       xfs_mount_t             *mp,
-       xfs_log_item_t          *lip,
-       int                     gen)
-{
-       int                     orig_gen = gen;
-
-       do {
-               ASSERT(lip->li_type != XFS_LI_EFI);
-               lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
-               /*
-                * The check will be bogus if we restart from the
-                * beginning of the AIL, so ASSERT that we don't.
-                * We never should since we're holding the AIL lock
-                * the entire time.
-                */
-               ASSERT(gen == orig_gen);
-       } while (lip != NULL);
-}
-#endif /* DEBUG */
-
 /*
  * When this is called, all of the EFIs which did not have
  * corresponding EFDs should be in the AIL.  What we do now
@@ -3080,20 +3058,23 @@ xlog_recover_process_efis(
 {
        xfs_log_item_t          *lip;
        xfs_efi_log_item_t      *efip;
-       int                     gen;
-       xfs_mount_t             *mp;
        int                     error = 0;
+       struct xfs_ail_cursor   cur;
+       struct xfs_ail          *ailp;
 
-       mp = log->l_mp;
-       spin_lock(&mp->m_ail_lock);
-
-       lip = xfs_trans_first_ail(mp, &gen);
+       ailp = log->l_ailp;
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
        while (lip != NULL) {
                /*
                 * We're done when we see something other than an EFI.
+                * There should be no EFIs left in the AIL now.
                 */
                if (lip->li_type != XFS_LI_EFI) {
-                       xlog_recover_check_ail(mp, lip, gen);
+#ifdef DEBUG
+                       for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+                               ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
                        break;
                }
 
@@ -3102,18 +3083,20 @@ xlog_recover_process_efis(
                 */
                efip = (xfs_efi_log_item_t *)lip;
                if (efip->efi_flags & XFS_EFI_RECOVERED) {
-                       lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+                       lip = xfs_trans_ail_cursor_next(ailp, &cur);
                        continue;
                }
 
-               spin_unlock(&mp->m_ail_lock);
-               error = xlog_recover_process_efi(mp, efip);
+               spin_unlock(&ailp->xa_lock);
+               error = xlog_recover_process_efi(log->l_mp, efip);
+               spin_lock(&ailp->xa_lock);
                if (error)
-                       return error;
-               spin_lock(&mp->m_ail_lock);
-               lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+                       goto out;
+               lip = xfs_trans_ail_cursor_next(ailp, &cur);
        }
-       spin_unlock(&mp->m_ail_lock);
+out:
+       xfs_trans_ail_cursor_done(ailp, &cur);
+       spin_unlock(&ailp->xa_lock);
        return error;
 }
 
index a4503f5e9497714a62e4fdc6cf90ac718cc5a065..177976dfea04045a2e8a392f04aa6e1e95080e8e 100644 (file)
@@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 STATIC void
 xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
 {
-       int     i;
-
        mp->m_agfrotor = mp->m_agirotor = 0;
        spin_lock_init(&mp->m_agirotor_lock);
        mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -582,7 +580,6 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
        mp->m_blockmask = sbp->sb_blocksize - 1;
        mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
        mp->m_blockwmask = mp->m_blockwsize - 1;
-       INIT_LIST_HEAD(&mp->m_del_inodes);
 
        /*
         * Setup for attributes, in case they get created.
@@ -605,24 +602,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
        }
        ASSERT(mp->m_attroffset < XFS_LITINO(mp));
 
-       for (i = 0; i < 2; i++) {
-               mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
-                       xfs_alloc, i == 0);
-               mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
-                       xfs_alloc, i == 0);
-       }
-       for (i = 0; i < 2; i++) {
-               mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
-                       xfs_bmbt, i == 0);
-               mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
-                       xfs_bmbt, i == 0);
-       }
-       for (i = 0; i < 2; i++) {
-               mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
-                       xfs_inobt, i == 0);
-               mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
-                       xfs_inobt, i == 0);
-       }
+       mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+       mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+       mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+       mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+       mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+       mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+       mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+       mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
 
        mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
        mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
@@ -1241,10 +1234,13 @@ xfs_unmountfs(
         * need to force the log first.
         */
        xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
-       xfs_iflush_all(mp);
+       xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
 
        XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
 
+       if (mp->m_quotainfo)
+               XFS_QM_DONE(mp);
+
        /*
         * Flush out the log synchronously so that we know for sure
         * that nothing is pinned.  This is important because bflush()
@@ -1285,11 +1281,6 @@ xfs_unmountfs(
        xfs_unmountfs_wait(mp);                 /* wait for async bufs */
        xfs_log_unmount(mp);                    /* Done! No more fs ops. */
 
-       /*
-        * All inodes from this mount point should be freed.
-        */
-       ASSERT(mp->m_inodes == NULL);
-
        if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
                uuid_table_remove(&mp->m_sb.sb_uuid);
 
@@ -1297,8 +1288,6 @@ xfs_unmountfs(
        xfs_errortag_clearall(mp, 0);
 #endif
        xfs_free_perag(mp);
-       if (mp->m_quotainfo)
-               XFS_QM_DONE(mp);
 }
 
 STATIC void
index f3c1024b1241ed8663568db7e309e75c83f866f0..e3f618c84e4742cdb423b628cae1eb12efc0ea61 100644 (file)
@@ -18,6 +18,7 @@
 #ifndef __XFS_MOUNT_H__
 #define        __XFS_MOUNT_H__
 
+#include "xfs_sync.h"
 
 typedef struct xfs_trans_reservations {
        uint    tr_write;       /* extent alloc trans */
@@ -44,14 +45,14 @@ typedef struct xfs_trans_reservations {
 } xfs_trans_reservations_t;
 
 #ifndef __KERNEL__
-/*
- * Moved here from xfs_ag.h to avoid reordering header files
- */
+
 #define XFS_DADDR_TO_AGNO(mp,d) \
        ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
 #define XFS_DADDR_TO_AGBNO(mp,d) \
        ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
-#else
+
+#else /* __KERNEL__ */
+
 struct cred;
 struct log;
 struct xfs_mount_args;
@@ -62,6 +63,7 @@ struct xfs_extdelta;
 struct xfs_swapext;
 struct xfs_mru_cache;
 struct xfs_nameops;
+struct xfs_ail;
 
 /*
  * Prototypes and functions for the Data Migration subsystem.
@@ -223,18 +225,10 @@ extern void       xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
 #define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
 #endif
 
-typedef struct xfs_ail {
-       struct list_head        xa_ail;
-       uint                    xa_gen;
-       struct task_struct      *xa_task;
-       xfs_lsn_t               xa_target;
-} xfs_ail_t;
-
 typedef struct xfs_mount {
        struct super_block      *m_super;
        xfs_tid_t               m_tid;          /* next unused tid for fs */
-       spinlock_t              m_ail_lock;     /* fs AIL mutex */
-       xfs_ail_t               m_ail;          /* fs active log item list */
+       struct xfs_ail          *m_ail;         /* fs active log item list */
        xfs_sb_t                m_sb;           /* copy of fs superblock */
        spinlock_t              m_sb_lock;      /* sb counter lock */
        struct xfs_buf          *m_sb_bp;       /* buffer for superblock */
@@ -247,9 +241,6 @@ typedef struct xfs_mount {
        xfs_agnumber_t          m_agirotor;     /* last ag dir inode alloced */
        spinlock_t              m_agirotor_lock;/* .. and lock protecting it */
        xfs_agnumber_t          m_maxagi;       /* highest inode alloc group */
-       struct xfs_inode        *m_inodes;      /* active inode list */
-       struct list_head        m_del_inodes;   /* inodes to reclaim */
-       mutex_t                 m_ilock;        /* inode list mutex */
        uint                    m_ireclaims;    /* count of calls to reclaim*/
        uint                    m_readio_log;   /* min read size log bytes */
        uint                    m_readio_blocks; /* min read size blocks */
@@ -267,7 +258,6 @@ typedef struct xfs_mount {
        xfs_buftarg_t           *m_ddev_targp;  /* saves taking the address */
        xfs_buftarg_t           *m_logdev_targp;/* ptr to log device */
        xfs_buftarg_t           *m_rtdev_targp; /* ptr to rt device */
-       __uint8_t               m_dircook_elog; /* log d-cookie entry bits */
        __uint8_t               m_blkbit_log;   /* blocklog + NBBY */
        __uint8_t               m_blkbb_log;    /* blocklog - BBSHIFT */
        __uint8_t               m_agno_log;     /* log #ag's */
@@ -276,12 +266,12 @@ typedef struct xfs_mount {
        uint                    m_blockmask;    /* sb_blocksize-1 */
        uint                    m_blockwsize;   /* sb_blocksize in words */
        uint                    m_blockwmask;   /* blockwsize-1 */
-       uint                    m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */
-       uint                    m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */
-       uint                    m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */
-       uint                    m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */
-       uint                    m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */
-       uint                    m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */
+       uint                    m_alloc_mxr[2]; /* max alloc btree records */
+       uint                    m_alloc_mnr[2]; /* min alloc btree records */
+       uint                    m_bmap_dmxr[2]; /* max bmap btree records */
+       uint                    m_bmap_dmnr[2]; /* min bmap btree records */
+       uint                    m_inobt_mxr[2]; /* max inobt btree records */
+       uint                    m_inobt_mnr[2]; /* min inobt btree records */
        uint                    m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
        uint                    m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
        uint                    m_in_maxlevels; /* XFS_IN_MAXLEVELS */
@@ -313,8 +303,7 @@ typedef struct xfs_mount {
        int                     m_attr_magicpct;/* 37% of the blocksize */
        int                     m_dir_magicpct; /* 37% of the dir blocksize */
        __uint8_t               m_mk_sharedro;  /* mark shared ro on unmount */
-       __uint8_t               m_inode_quiesce;/* call quiesce on new inodes.
-                                                  field governed by m_ilock */
+       __uint8_t               m_inode_quiesce;/* call quiesce on new inodes. */
        __uint8_t               m_sectbb_log;   /* sectlog - BBSHIFT */
        const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
        int                     m_dirblksize;   /* directory block sz--bytes */
@@ -508,7 +497,6 @@ typedef struct xfs_mod_sb {
 #define        XFS_MOUNT_ILOCK(mp)     mutex_lock(&((mp)->m_ilock))
 #define        XFS_MOUNT_IUNLOCK(mp)   mutex_unlock(&((mp)->m_ilock))
 
-extern void    xfs_mod_sb(xfs_trans_t *, __int64_t);
 extern int     xfs_log_sbcount(xfs_mount_t *, uint);
 extern int     xfs_mountfs(xfs_mount_t *mp);
 extern void    xfs_mountfs_check_barriers(xfs_mount_t *mp);
@@ -525,20 +513,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
 extern int     xfs_readsb(xfs_mount_t *, int);
 extern void    xfs_freesb(xfs_mount_t *);
 extern int     xfs_fs_writable(xfs_mount_t *);
-extern int     xfs_syncsub(xfs_mount_t *, int, int *);
-extern int     xfs_sync_inodes(xfs_mount_t *, int, int *);
-extern xfs_agnumber_t  xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
-extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 extern int     xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
-extern int     xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *);
+extern int     xfs_dmops_get(struct xfs_mount *);
 extern void    xfs_dmops_put(struct xfs_mount *);
-extern int     xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *);
+extern int     xfs_qmops_get(struct xfs_mount *);
 extern void    xfs_qmops_put(struct xfs_mount *);
 
 extern struct xfs_dmops xfs_dmcore_xfs;
 
 #endif /* __KERNEL__ */
 
+extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern xfs_agnumber_t  xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+
 #endif /* __XFS_MOUNT_H__ */
index a294e58db8dda71c5230c39f39117863c7344d72..27f80581520ad2d19c138419a2e3bca42de86944 100644 (file)
@@ -28,7 +28,6 @@
 #include "xfs_mount.h"
 #include "xfs_quota.h"
 #include "xfs_error.h"
-#include "xfs_clnt.h"
 
 
 STATIC struct xfs_dquot *
@@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = {
 };
 
 int
-xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args)
+xfs_qmops_get(struct xfs_mount *mp)
 {
-       if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) {
+       if (XFS_IS_QUOTA_RUNNING(mp)) {
 #ifdef CONFIG_XFS_QUOTA
                mp->m_qm_ops = &xfs_qmcore_xfs;
 #else
index 4e1c22a23be5fb2496b20600767de4ee1a8822ce..ad137efc87020561f208cbbd3a382f98fcff24f7 100644 (file)
@@ -1383,11 +1383,12 @@ xfs_trans_chunk_committed(
        xfs_log_item_desc_t     *lidp;
        xfs_log_item_t          *lip;
        xfs_lsn_t               item_lsn;
-       struct xfs_mount        *mp;
        int                     i;
 
        lidp = licp->lic_descs;
        for (i = 0; i < licp->lic_unused; i++, lidp++) {
+               struct xfs_ail          *ailp;
+
                if (xfs_lic_isfree(licp, i)) {
                        continue;
                }
@@ -1424,19 +1425,19 @@ xfs_trans_chunk_committed(
                 * This would cause the earlier transaction to fail
                 * the test below.
                 */
-               mp = lip->li_mountp;
-               spin_lock(&mp->m_ail_lock);
+               ailp = lip->li_ailp;
+               spin_lock(&ailp->xa_lock);
                if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
                        /*
                         * This will set the item's lsn to item_lsn
                         * and update the position of the item in
                         * the AIL.
                         *
-                        * xfs_trans_update_ail() drops the AIL lock.
+                        * xfs_trans_ail_update() drops the AIL lock.
                         */
-                       xfs_trans_update_ail(mp, lip, item_lsn);
+                       xfs_trans_ail_update(ailp, lip, item_lsn);
                } else {
-                       spin_unlock(&mp->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
                }
 
                /*
index 74c80bd2b0ec5a838e325945b9787a2a9c3c10b3..d6fe4a88d79f0557bcdd2a46434a438debd9cb72 100644 (file)
@@ -18,6 +18,8 @@
 #ifndef        __XFS_TRANS_H__
 #define        __XFS_TRANS_H__
 
+struct xfs_log_item;
+
 /*
  * This is the structure written in the log at the head of
  * every transaction. It identifies the type and id of the
@@ -98,76 +100,6 @@ typedef struct xfs_trans_header {
 #define        XFS_TRANS_TYPE_MAX              41
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
-
-#ifdef __KERNEL__
-struct xfs_buf;
-struct xfs_buftarg;
-struct xfs_efd_log_item;
-struct xfs_efi_log_item;
-struct xfs_inode;
-struct xfs_item_ops;
-struct xfs_log_iovec;
-struct xfs_log_item;
-struct xfs_log_item_desc;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_dquot_acct;
-
-typedef struct xfs_log_item {
-       struct list_head                li_ail;         /* AIL pointers */
-       xfs_lsn_t                       li_lsn;         /* last on-disk lsn */
-       struct xfs_log_item_desc        *li_desc;       /* ptr to current desc*/
-       struct xfs_mount                *li_mountp;     /* ptr to fs mount */
-       uint                            li_type;        /* item type */
-       uint                            li_flags;       /* misc flags */
-       struct xfs_log_item             *li_bio_list;   /* buffer item list */
-       void                            (*li_cb)(struct xfs_buf *,
-                                                struct xfs_log_item *);
-                                                       /* buffer item iodone */
-                                                       /* callback func */
-       struct xfs_item_ops             *li_ops;        /* function list */
-} xfs_log_item_t;
-
-#define        XFS_LI_IN_AIL   0x1
-#define XFS_LI_ABORTED 0x2
-
-typedef struct xfs_item_ops {
-       uint (*iop_size)(xfs_log_item_t *);
-       void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
-       void (*iop_pin)(xfs_log_item_t *);
-       void (*iop_unpin)(xfs_log_item_t *, int);
-       void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
-       uint (*iop_trylock)(xfs_log_item_t *);
-       void (*iop_unlock)(xfs_log_item_t *);
-       xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
-       void (*iop_push)(xfs_log_item_t *);
-       void (*iop_pushbuf)(xfs_log_item_t *);
-       void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
-
-#define IOP_SIZE(ip)           (*(ip)->li_ops->iop_size)(ip)
-#define IOP_FORMAT(ip,vp)      (*(ip)->li_ops->iop_format)(ip, vp)
-#define IOP_PIN(ip)            (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags)   (*(ip)->li_ops->iop_unpin)(ip, flags)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
-#define IOP_TRYLOCK(ip)                (*(ip)->li_ops->iop_trylock)(ip)
-#define IOP_UNLOCK(ip)         (*(ip)->li_ops->iop_unlock)(ip)
-#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
-#define IOP_PUSH(ip)           (*(ip)->li_ops->iop_push)(ip)
-#define IOP_PUSHBUF(ip)                (*(ip)->li_ops->iop_pushbuf)(ip)
-#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
-
-/*
- * Return values for the IOP_TRYLOCK() routines.
- */
-#define        XFS_ITEM_SUCCESS        0
-#define        XFS_ITEM_PINNED         1
-#define        XFS_ITEM_LOCKED         2
-#define        XFS_ITEM_FLUSHING       3
-#define XFS_ITEM_PUSHBUF       4
-
-#endif /* __KERNEL__ */
-
 /*
  * This structure is used to track log items associated with
  * a transaction.  It points to the log item and keeps some
@@ -176,7 +108,7 @@ typedef struct xfs_item_ops {
  * once we get to commit processing (see xfs_trans_commit()).
  */
 typedef struct xfs_log_item_desc {
-       xfs_log_item_t  *lid_item;
+       struct xfs_log_item     *lid_item;
        ushort          lid_size;
        unsigned char   lid_flags;
        unsigned char   lid_index;
@@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
                (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
 }
 
-#ifdef __KERNEL__
-/*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction.  The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
-       xfs_agnumber_t          lbc_ag;
-       ushort                  lbc_idx;        /* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS      31
-typedef struct xfs_log_busy_chunk {
-       struct xfs_log_busy_chunk       *lbc_next;
-       uint                            lbc_free;       /* free slots bitmask */
-       ushort                          lbc_unused;     /* first unused */
-       xfs_log_busy_slot_t             lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define        XFS_LBC_MAX_SLOT        (XFS_LBC_NUM_SLOTS - 1)
-#define        XFS_LBC_FREEMASK        ((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define        XFS_LBC_INIT(cp)        ((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define        XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
-#define        XFS_LBC_SLOT(cp, slot)  (&((cp)->lbc_busy[(slot)]))
-#define        XFS_LBC_VACANCY(cp)     (((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define        XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-
-/*
- * This is the type of function which can be given to xfs_trans_callback()
- * to be called upon the transaction's commit to disk.
- */
-typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
-
-/*
- * This is the structure maintained for every active transaction.
- */
-typedef struct xfs_trans {
-       unsigned int            t_magic;        /* magic number */
-       xfs_log_callback_t      t_logcb;        /* log callback struct */
-       unsigned int            t_type;         /* transaction type */
-       unsigned int            t_log_res;      /* amt of log space resvd */
-       unsigned int            t_log_count;    /* count for perm log res */
-       unsigned int            t_blk_res;      /* # of blocks resvd */
-       unsigned int            t_blk_res_used; /* # of resvd blocks used */
-       unsigned int            t_rtx_res;      /* # of rt extents resvd */
-       unsigned int            t_rtx_res_used; /* # of resvd rt extents used */
-       xfs_log_ticket_t        t_ticket;       /* log mgr ticket */
-       xfs_lsn_t               t_lsn;          /* log seq num of start of
-                                                * transaction. */
-       xfs_lsn_t               t_commit_lsn;   /* log seq num of end of
-                                                * transaction. */
-       struct xfs_mount        *t_mountp;      /* ptr to fs mount struct */
-       struct xfs_dquot_acct   *t_dqinfo;      /* acctg info for dquots */
-       xfs_trans_callback_t    t_callback;     /* transaction callback */
-       void                    *t_callarg;     /* callback arg */
-       unsigned int            t_flags;        /* misc flags */
-       int64_t                 t_icount_delta; /* superblock icount change */
-       int64_t                 t_ifree_delta;  /* superblock ifree change */
-       int64_t                 t_fdblocks_delta; /* superblock fdblocks chg */
-       int64_t                 t_res_fdblocks_delta; /* on-disk only chg */
-       int64_t                 t_frextents_delta;/* superblock freextents chg*/
-       int64_t                 t_res_frextents_delta; /* on-disk only chg */
-#ifdef DEBUG
-       int64_t                 t_ag_freeblks_delta; /* debugging counter */
-       int64_t                 t_ag_flist_delta; /* debugging counter */
-       int64_t                 t_ag_btree_delta; /* debugging counter */
-#endif
-       int64_t                 t_dblocks_delta;/* superblock dblocks change */
-       int64_t                 t_agcount_delta;/* superblock agcount change */
-       int64_t                 t_imaxpct_delta;/* superblock imaxpct change */
-       int64_t                 t_rextsize_delta;/* superblock rextsize chg */
-       int64_t                 t_rbmblocks_delta;/* superblock rbmblocks chg */
-       int64_t                 t_rblocks_delta;/* superblock rblocks change */
-       int64_t                 t_rextents_delta;/* superblocks rextents chg */
-       int64_t                 t_rextslog_delta;/* superblocks rextslog chg */
-       unsigned int            t_items_free;   /* log item descs free */
-       xfs_log_item_chunk_t    t_items;        /* first log item desc chunk */
-       xfs_trans_header_t      t_header;       /* header for in-log trans */
-       unsigned int            t_busy_free;    /* busy descs free */
-       xfs_log_busy_chunk_t    t_busy;         /* busy/async free blocks */
-       unsigned long           t_pflags;       /* saved process flags state */
-} xfs_trans_t;
-
-#endif /* __KERNEL__ */
-
-
 #define        XFS_TRANS_MAGIC         0x5452414E      /* 'TRAN' */
 /*
  * Values for t_flags.
@@ -906,6 +750,157 @@ typedef struct xfs_trans {
 #define        XFS_DQUOT_REF           1
 
 #ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot_acct;
+
+typedef struct xfs_log_item {
+       struct list_head                li_ail;         /* AIL pointers */
+       xfs_lsn_t                       li_lsn;         /* last on-disk lsn */
+       struct xfs_log_item_desc        *li_desc;       /* ptr to current desc*/
+       struct xfs_mount                *li_mountp;     /* ptr to fs mount */
+       struct xfs_ail                  *li_ailp;       /* ptr to AIL */
+       uint                            li_type;        /* item type */
+       uint                            li_flags;       /* misc flags */
+       struct xfs_log_item             *li_bio_list;   /* buffer item list */
+       void                            (*li_cb)(struct xfs_buf *,
+                                                struct xfs_log_item *);
+                                                       /* buffer item iodone */
+                                                       /* callback func */
+       struct xfs_item_ops             *li_ops;        /* function list */
+} xfs_log_item_t;
+
+#define        XFS_LI_IN_AIL   0x1
+#define XFS_LI_ABORTED 0x2
+
+typedef struct xfs_item_ops {
+       uint (*iop_size)(xfs_log_item_t *);
+       void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+       void (*iop_pin)(xfs_log_item_t *);
+       void (*iop_unpin)(xfs_log_item_t *, int);
+       void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+       uint (*iop_trylock)(xfs_log_item_t *);
+       void (*iop_unlock)(xfs_log_item_t *);
+       xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+       void (*iop_push)(xfs_log_item_t *);
+       void (*iop_pushbuf)(xfs_log_item_t *);
+       void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+} xfs_item_ops_t;
+
+#define IOP_SIZE(ip)           (*(ip)->li_ops->iop_size)(ip)
+#define IOP_FORMAT(ip,vp)      (*(ip)->li_ops->iop_format)(ip, vp)
+#define IOP_PIN(ip)            (*(ip)->li_ops->iop_pin)(ip)
+#define IOP_UNPIN(ip, flags)   (*(ip)->li_ops->iop_unpin)(ip, flags)
+#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_TRYLOCK(ip)                (*(ip)->li_ops->iop_trylock)(ip)
+#define IOP_UNLOCK(ip)         (*(ip)->li_ops->iop_unlock)(ip)
+#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
+#define IOP_PUSH(ip)           (*(ip)->li_ops->iop_push)(ip)
+#define IOP_PUSHBUF(ip)                (*(ip)->li_ops->iop_pushbuf)(ip)
+#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
+
+/*
+ * Return values for the IOP_TRYLOCK() routines.
+ */
+#define        XFS_ITEM_SUCCESS        0
+#define        XFS_ITEM_PINNED         1
+#define        XFS_ITEM_LOCKED         2
+#define        XFS_ITEM_FLUSHING       3
+#define XFS_ITEM_PUSHBUF       4
+
+/*
+ * This structure is used to maintain a list of block ranges that have been
+ * freed in the transaction.  The ranges are listed in the perag[] busy list
+ * between when they're freed and the transaction is committed to disk.
+ */
+
+typedef struct xfs_log_busy_slot {
+       xfs_agnumber_t          lbc_ag;
+       ushort                  lbc_idx;        /* index in perag.busy[] */
+} xfs_log_busy_slot_t;
+
+#define XFS_LBC_NUM_SLOTS      31
+typedef struct xfs_log_busy_chunk {
+       struct xfs_log_busy_chunk       *lbc_next;
+       uint                            lbc_free;       /* free slots bitmask */
+       ushort                          lbc_unused;     /* first unused */
+       xfs_log_busy_slot_t             lbc_busy[XFS_LBC_NUM_SLOTS];
+} xfs_log_busy_chunk_t;
+
+#define        XFS_LBC_MAX_SLOT        (XFS_LBC_NUM_SLOTS - 1)
+#define        XFS_LBC_FREEMASK        ((1U << XFS_LBC_NUM_SLOTS) - 1)
+
+#define        XFS_LBC_INIT(cp)        ((cp)->lbc_free = XFS_LBC_FREEMASK)
+#define        XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
+#define        XFS_LBC_SLOT(cp, slot)  (&((cp)->lbc_busy[(slot)]))
+#define        XFS_LBC_VACANCY(cp)     (((cp)->lbc_free) & XFS_LBC_FREEMASK)
+#define        XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
+
+/*
+ * This is the type of function which can be given to xfs_trans_callback()
+ * to be called upon the transaction's commit to disk.
+ */
+typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+       unsigned int            t_magic;        /* magic number */
+       xfs_log_callback_t      t_logcb;        /* log callback struct */
+       unsigned int            t_type;         /* transaction type */
+       unsigned int            t_log_res;      /* amt of log space resvd */
+       unsigned int            t_log_count;    /* count for perm log res */
+       unsigned int            t_blk_res;      /* # of blocks resvd */
+       unsigned int            t_blk_res_used; /* # of resvd blocks used */
+       unsigned int            t_rtx_res;      /* # of rt extents resvd */
+       unsigned int            t_rtx_res_used; /* # of resvd rt extents used */
+       xfs_log_ticket_t        t_ticket;       /* log mgr ticket */
+       xfs_lsn_t               t_lsn;          /* log seq num of start of
+                                                * transaction. */
+       xfs_lsn_t               t_commit_lsn;   /* log seq num of end of
+                                                * transaction. */
+       struct xfs_mount        *t_mountp;      /* ptr to fs mount struct */
+       struct xfs_dquot_acct   *t_dqinfo;      /* acctg info for dquots */
+       xfs_trans_callback_t    t_callback;     /* transaction callback */
+       void                    *t_callarg;     /* callback arg */
+       unsigned int            t_flags;        /* misc flags */
+       int64_t                 t_icount_delta; /* superblock icount change */
+       int64_t                 t_ifree_delta;  /* superblock ifree change */
+       int64_t                 t_fdblocks_delta; /* superblock fdblocks chg */
+       int64_t                 t_res_fdblocks_delta; /* on-disk only chg */
+       int64_t                 t_frextents_delta;/* superblock freextents chg*/
+       int64_t                 t_res_frextents_delta; /* on-disk only chg */
+#ifdef DEBUG
+       int64_t                 t_ag_freeblks_delta; /* debugging counter */
+       int64_t                 t_ag_flist_delta; /* debugging counter */
+       int64_t                 t_ag_btree_delta; /* debugging counter */
+#endif
+       int64_t                 t_dblocks_delta;/* superblock dblocks change */
+       int64_t                 t_agcount_delta;/* superblock agcount change */
+       int64_t                 t_imaxpct_delta;/* superblock imaxpct change */
+       int64_t                 t_rextsize_delta;/* superblock rextsize chg */
+       int64_t                 t_rbmblocks_delta;/* superblock rbmblocks chg */
+       int64_t                 t_rblocks_delta;/* superblock rblocks change */
+       int64_t                 t_rextents_delta;/* superblocks rextents chg */
+       int64_t                 t_rextslog_delta;/* superblocks rextslog chg */
+       unsigned int            t_items_free;   /* log item descs free */
+       xfs_log_item_chunk_t    t_items;        /* first log item desc chunk */
+       xfs_trans_header_t      t_header;       /* header for in-log trans */
+       unsigned int            t_busy_free;    /* busy descs free */
+       xfs_log_busy_chunk_t    t_busy;         /* busy/async free blocks */
+       unsigned long           t_pflags;       /* saved process flags state */
+} xfs_trans_t;
+
 /*
  * XFS transaction mechanism exported interfaces that are
  * actually macros.
@@ -928,7 +923,6 @@ typedef struct xfs_trans {
 /*
  * XFS transaction mechanism exported interfaces.
  */
-void           xfs_trans_init(struct xfs_mount *);
 xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
 xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint);
 xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
@@ -975,13 +969,8 @@ int                _xfs_trans_commit(xfs_trans_t *,
                                  int *);
 #define xfs_trans_commit(tp, flags)    _xfs_trans_commit(tp, flags, NULL)
 void           xfs_trans_cancel(xfs_trans_t *, int);
-int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 int            xfs_trans_ail_init(struct xfs_mount *);
 void           xfs_trans_ail_destroy(struct xfs_mount *);
-void           xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
-xfs_lsn_t      xfs_trans_tail_ail(struct xfs_mount *);
-void           xfs_trans_unlocked_item(struct xfs_mount *,
-                                       xfs_log_item_t *);
 xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
                                        xfs_agnumber_t ag,
                                        xfs_extlen_t idx);
@@ -990,4 +979,7 @@ extern kmem_zone_t  *xfs_trans_zone;
 
 #endif /* __KERNEL__ */
 
+void           xfs_trans_init(struct xfs_mount *);
+int            xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+
 #endif /* __XFS_TRANS_H__ */
index 1f77c00af566be43c8a2c569e2be9222002f555f..2d47f10f8bed4cbe9b2441a29649311c2aa0acae 100644 (file)
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008 Dave Chinner
  * All Rights Reserved.
  *
  * This program is free software; you can redistribute it and/or
 #include "xfs_trans_priv.h"
 #include "xfs_error.h"
 
-STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
-STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
+STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
+STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
 
 #ifdef DEBUG
-STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
+STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *);
 #else
 #define        xfs_ail_check(a,l)
 #endif /* DEBUG */
@@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
  * lsn of the last item in the AIL.
  */
 xfs_lsn_t
-xfs_trans_tail_ail(
-       xfs_mount_t     *mp)
+xfs_trans_ail_tail(
+       struct xfs_ail  *ailp)
 {
        xfs_lsn_t       lsn;
        xfs_log_item_t  *lip;
 
-       spin_lock(&mp->m_ail_lock);
-       lip = xfs_ail_min(&mp->m_ail);
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_ail_min(ailp);
        if (lip == NULL) {
                lsn = (xfs_lsn_t)0;
        } else {
                lsn = lip->li_lsn;
        }
-       spin_unlock(&mp->m_ail_lock);
+       spin_unlock(&ailp->xa_lock);
 
        return lsn;
 }
@@ -85,16 +86,125 @@ xfs_trans_tail_ail(
  * any of the objects, so the lock is not needed.
  */
 void
-xfs_trans_push_ail(
-       xfs_mount_t             *mp,
-       xfs_lsn_t               threshold_lsn)
+xfs_trans_ail_push(
+       struct xfs_ail  *ailp,
+       xfs_lsn_t       threshold_lsn)
 {
-       xfs_log_item_t          *lip;
+       xfs_log_item_t  *lip;
+
+       lip = xfs_ail_min(ailp);
+       if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+               if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0)
+                       xfsaild_wakeup(ailp, threshold_lsn);
+       }
+}
+
+/*
+ * AIL traversal cursor initialisation.
+ *
+ * The cursor keeps track of where our current traversal is up
+ * to by tracking the next Æ£tem in the list for us. However, for
+ * this to be safe, removing an object from the AIL needs to invalidate
+ * any cursor that points to it. hence the traversal cursor needs to
+ * be linked to the struct xfs_ail so that deletion can search all the
+ * active cursors for invalidation.
+ *
+ * We don't link the push cursor because it is embedded in the struct
+ * xfs_ail and hence easily findable.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur)
+{
+       cur->item = NULL;
+       if (cur == &ailp->xa_cursors)
+               return;
+
+       cur->next = ailp->xa_cursors.next;
+       ailp->xa_cursors.next = cur;
+}
+
+/*
+ * Set the cursor to the next item, because when we look
+ * up the cursor the current item may have been freed.
+ */
+STATIC void
+xfs_trans_ail_cursor_set(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur,
+       struct xfs_log_item     *lip)
+{
+       if (lip)
+               cur->item = xfs_ail_next(ailp, lip);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor.
+ * If the cursor was invalidated (inidicated by a lip of 1),
+ * restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur)
+{
+       struct xfs_log_item     *lip = cur->item;
+
+       if ((__psint_t)lip & 1)
+               lip = xfs_ail_min(ailp);
+       xfs_trans_ail_cursor_set(ailp, cur, lip);
+       return lip;
+}
+
+/*
+ * Now that the traversal is complete, we need to remove the cursor
+ * from the list of traversing cursors. Avoid removing the embedded
+ * push cursor, but use the fact it is alway present to make the
+ * list deletion simple.
+ */
+void
+xfs_trans_ail_cursor_done(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *done)
+{
+       struct xfs_ail_cursor   *prev = NULL;
+       struct xfs_ail_cursor   *cur;
+
+       done->item = NULL;
+       if (done == &ailp->xa_cursors)
+               return;
+       prev = &ailp->xa_cursors;
+       for (cur = prev->next; cur; prev = cur, cur = prev->next) {
+               if (cur == done) {
+                       prev->next = cur->next;
+                       break;
+               }
+       }
+       ASSERT(cur);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is
+ * called when an item is removed from the AIL. Any cursor pointing
+ * to this object is now invalid and the traversal needs to be
+ * terminated so it doesn't reference a freed object. We set the
+ * cursor item to a value of 1 so we can distinguish between an
+ * invalidation and the end of the list when getting the next item
+ * from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+       struct xfs_ail          *ailp,
+       struct xfs_log_item     *lip)
+{
+       struct xfs_ail_cursor   *cur;
 
-       lip = xfs_ail_min(&mp->m_ail);
-       if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
-               if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
-                       xfsaild_wakeup(mp, threshold_lsn);
+       /* need to search all cursors */
+       for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
+               if (cur->item == lip)
+                       cur->item = (struct xfs_log_item *)
+                                       ((__psint_t)cur->item | 1);
        }
 }
 
@@ -103,25 +213,27 @@ xfs_trans_push_ail(
  * Return the current tree generation number for use
  * in calls to xfs_trans_next_ail().
  */
-STATIC xfs_log_item_t *
-xfs_trans_first_push_ail(
-       xfs_mount_t     *mp,
-       int             *gen,
-       xfs_lsn_t       lsn)
+xfs_log_item_t *
+xfs_trans_ail_cursor_first(
+       struct xfs_ail          *ailp,
+       struct xfs_ail_cursor   *cur,
+       xfs_lsn_t               lsn)
 {
-       xfs_log_item_t  *lip;
+       xfs_log_item_t          *lip;
 
-       lip = xfs_ail_min(&mp->m_ail);
-       *gen = (int)mp->m_ail.xa_gen;
+       xfs_trans_ail_cursor_init(ailp, cur);
+       lip = xfs_ail_min(ailp);
        if (lsn == 0)
-               return lip;
+               goto out;
 
-       list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
+       list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
                if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
-                       return lip;
+                       goto out;
        }
-
-       return NULL;
+       lip = NULL;
+out:
+       xfs_trans_ail_cursor_set(ailp, cur, lip);
+       return lip;
 }
 
 /*
@@ -129,29 +241,29 @@ xfs_trans_first_push_ail(
  */
 long
 xfsaild_push(
-       xfs_mount_t     *mp,
+       struct xfs_ail  *ailp,
        xfs_lsn_t       *last_lsn)
 {
        long            tout = 1000; /* milliseconds */
        xfs_lsn_t       last_pushed_lsn = *last_lsn;
-       xfs_lsn_t       target =  mp->m_ail.xa_target;
+       xfs_lsn_t       target =  ailp->xa_target;
        xfs_lsn_t       lsn;
        xfs_log_item_t  *lip;
-       int             gen;
-       int             restarts;
        int             flush_log, count, stuck;
+       xfs_mount_t     *mp = ailp->xa_mount;
+       struct xfs_ail_cursor   *cur = &ailp->xa_cursors;
 
-#define        XFS_TRANS_PUSH_AIL_RESTARTS     10
-
-       spin_lock(&mp->m_ail_lock);
-       lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
+       spin_lock(&ailp->xa_lock);
+       xfs_trans_ail_cursor_init(ailp, cur);
+       lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
        if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
                /*
                 * AIL is empty or our push has reached the end.
                 */
-               spin_unlock(&mp->m_ail_lock);
+               xfs_trans_ail_cursor_done(ailp, cur);
+               spin_unlock(&ailp->xa_lock);
                last_pushed_lsn = 0;
-               goto out;
+               return tout;
        }
 
        XFS_STATS_INC(xs_push_ail);
@@ -169,7 +281,7 @@ xfsaild_push(
         */
        tout = 10;
        lsn = lip->li_lsn;
-       flush_log = stuck = count = restarts = 0;
+       flush_log = stuck = count = 0;
        while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
                int     lock_result;
                /*
@@ -184,7 +296,7 @@ xfsaild_push(
                 * skip to the next item in the list.
                 */
                lock_result = IOP_TRYLOCK(lip);
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
                switch (lock_result) {
                case XFS_ITEM_SUCCESS:
                        XFS_STATS_INC(xs_push_ail_success);
@@ -221,7 +333,7 @@ xfsaild_push(
                        break;
                }
 
-               spin_lock(&mp->m_ail_lock);
+               spin_lock(&ailp->xa_lock);
                /* should we bother continuing? */
                if (XFS_FORCED_SHUTDOWN(mp))
                        break;
@@ -244,14 +356,13 @@ xfsaild_push(
                if (stuck > 100)
                        break;
 
-               lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
+               lip = xfs_trans_ail_cursor_next(ailp, cur);
                if (lip == NULL)
                        break;
-               if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
-                       break;
                lsn = lip->li_lsn;
        }
-       spin_unlock(&mp->m_ail_lock);
+       xfs_trans_ail_cursor_done(ailp, cur);
+       spin_unlock(&ailp->xa_lock);
 
        if (flush_log) {
                /*
@@ -274,8 +385,7 @@ xfsaild_push(
                 */
                tout += 20;
                last_pushed_lsn = 0;
-       } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) ||
-                  ((stuck * 100) / count > 90)) {
+       } else if ((stuck * 100) / count > 90) {
                /*
                 * Either there is a lot of contention on the AIL or we
                 * are stuck due to operations in progress. "Stuck" in this
@@ -287,7 +397,6 @@ xfsaild_push(
                 */
                tout += 10;
        }
-out:
        *last_lsn = last_pushed_lsn;
        return tout;
 }      /* xfsaild_push */
@@ -303,7 +412,7 @@ out:
  */
 void
 xfs_trans_unlocked_item(
-       xfs_mount_t     *mp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 {
        xfs_log_item_t  *min_lip;
@@ -315,7 +424,7 @@ xfs_trans_unlocked_item(
         * over some potentially valid data.
         */
        if (!(lip->li_flags & XFS_LI_IN_AIL) ||
-           XFS_FORCED_SHUTDOWN(mp)) {
+           XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
                return;
        }
 
@@ -331,10 +440,10 @@ xfs_trans_unlocked_item(
         * the call to xfs_log_move_tail() doesn't do anything if there's
         * not enough free space to wake people up so we're safe calling it.
         */
-       min_lip = xfs_ail_min(&mp->m_ail);
+       min_lip = xfs_ail_min(ailp);
 
        if (min_lip == lip)
-               xfs_log_move_tail(mp, 1);
+               xfs_log_move_tail(ailp->xa_mount, 1);
 }      /* xfs_trans_unlocked_item */
 
 
@@ -347,41 +456,37 @@ xfs_trans_unlocked_item(
  * we move in the AIL is the minimum one, update the tail lsn in the
  * log manager.
  *
- * Increment the AIL's generation count to indicate that the tree
- * has changed.
- *
  * This function must be called with the AIL lock held.  The lock
  * is dropped before returning.
  */
 void
-xfs_trans_update_ail(
-       xfs_mount_t     *mp,
+xfs_trans_ail_update(
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip,
-       xfs_lsn_t       lsn) __releases(mp->m_ail_lock)
+       xfs_lsn_t       lsn) __releases(ailp->xa_lock)
 {
-       xfs_log_item_t          *dlip=NULL;
+       xfs_log_item_t          *dlip = NULL;
        xfs_log_item_t          *mlip;  /* ptr to minimum lip */
 
-       mlip = xfs_ail_min(&mp->m_ail);
+       mlip = xfs_ail_min(ailp);
 
        if (lip->li_flags & XFS_LI_IN_AIL) {
-               dlip = xfs_ail_delete(&mp->m_ail, lip);
+               dlip = xfs_ail_delete(ailp, lip);
                ASSERT(dlip == lip);
+               xfs_trans_ail_cursor_clear(ailp, dlip);
        } else {
                lip->li_flags |= XFS_LI_IN_AIL;
        }
 
        lip->li_lsn = lsn;
-
-       xfs_ail_insert(&mp->m_ail, lip);
-       mp->m_ail.xa_gen++;
+       xfs_ail_insert(ailp, lip);
 
        if (mlip == dlip) {
-               mlip = xfs_ail_min(&mp->m_ail);
-               spin_unlock(&mp->m_ail_lock);
-               xfs_log_move_tail(mp, mlip->li_lsn);
+               mlip = xfs_ail_min(ailp);
+               spin_unlock(&ailp->xa_lock);
+               xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
        } else {
-               spin_unlock(&mp->m_ail_lock);
+               spin_unlock(&ailp->xa_lock);
        }
 
 
@@ -403,29 +508,30 @@ xfs_trans_update_ail(
  * is dropped before returning.
  */
 void
-xfs_trans_delete_ail(
-       xfs_mount_t     *mp,
-       xfs_log_item_t  *lip) __releases(mp->m_ail_lock)
+xfs_trans_ail_delete(
+       struct xfs_ail  *ailp,
+       xfs_log_item_t  *lip) __releases(ailp->xa_lock)
 {
        xfs_log_item_t          *dlip;
        xfs_log_item_t          *mlip;
 
        if (lip->li_flags & XFS_LI_IN_AIL) {
-               mlip = xfs_ail_min(&mp->m_ail);
-               dlip = xfs_ail_delete(&mp->m_ail, lip);
+               mlip = xfs_ail_min(ailp);
+               dlip = xfs_ail_delete(ailp, lip);
                ASSERT(dlip == lip);
+               xfs_trans_ail_cursor_clear(ailp, dlip);
 
 
                lip->li_flags &= ~XFS_LI_IN_AIL;
                lip->li_lsn = 0;
-               mp->m_ail.xa_gen++;
 
                if (mlip == dlip) {
-                       mlip = xfs_ail_min(&mp->m_ail);
-                       spin_unlock(&mp->m_ail_lock);
-                       xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
+                       mlip = xfs_ail_min(ailp);
+                       spin_unlock(&ailp->xa_lock);
+                       xfs_log_move_tail(ailp->xa_mount,
+                                               (mlip ? mlip->li_lsn : 0));
                } else {
-                       spin_unlock(&mp->m_ail_lock);
+                       spin_unlock(&ailp->xa_lock);
                }
        }
        else {
@@ -433,13 +539,13 @@ xfs_trans_delete_ail(
                 * If the file system is not being shutdown, we are in
                 * serious trouble if we get to this stage.
                 */
-               if (XFS_FORCED_SHUTDOWN(mp))
-                       spin_unlock(&mp->m_ail_lock);
-               else {
+               struct xfs_mount        *mp = ailp->xa_mount;
+
+               spin_unlock(&ailp->xa_lock);
+               if (!XFS_FORCED_SHUTDOWN(mp)) {
                        xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
                "%s: attempting to delete a log item that is not in the AIL",
                                        __func__);
-                       spin_unlock(&mp->m_ail_lock);
                        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
                }
        }
@@ -447,56 +553,6 @@ xfs_trans_delete_ail(
 
 
 
-/*
- * Return the item in the AIL with the smallest lsn.
- * Return the current tree generation number for use
- * in calls to xfs_trans_next_ail().
- */
-xfs_log_item_t *
-xfs_trans_first_ail(
-       xfs_mount_t     *mp,
-       int             *gen)
-{
-       xfs_log_item_t  *lip;
-
-       lip = xfs_ail_min(&mp->m_ail);
-       *gen = (int)mp->m_ail.xa_gen;
-
-       return lip;
-}
-
-/*
- * If the generation count of the tree has not changed since the
- * caller last took something from the AIL, then return the elmt
- * in the tree which follows the one given.  If the count has changed,
- * then return the minimum elmt of the AIL and bump the restarts counter
- * if one is given.
- */
-xfs_log_item_t *
-xfs_trans_next_ail(
-       xfs_mount_t     *mp,
-       xfs_log_item_t  *lip,
-       int             *gen,
-       int             *restarts)
-{
-       xfs_log_item_t  *nlip;
-
-       ASSERT(mp && lip && gen);
-       if (mp->m_ail.xa_gen == *gen) {
-               nlip = xfs_ail_next(&mp->m_ail, lip);
-       } else {
-               nlip = xfs_ail_min(&mp->m_ail);
-               *gen = (int)mp->m_ail.xa_gen;
-               if (restarts != NULL) {
-                       XFS_STATS_INC(xs_push_ail_restarts);
-                       (*restarts)++;
-               }
-       }
-
-       return (nlip);
-}
-
-
 /*
  * The active item list (AIL) is a doubly linked list of log
  * items sorted by ascending lsn.  The base of the list is
@@ -515,15 +571,35 @@ int
 xfs_trans_ail_init(
        xfs_mount_t     *mp)
 {
-       INIT_LIST_HEAD(&mp->m_ail.xa_ail);
-       return xfsaild_start(mp);
+       struct xfs_ail  *ailp;
+       int             error;
+
+       ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+       if (!ailp)
+               return ENOMEM;
+
+       ailp->xa_mount = mp;
+       INIT_LIST_HEAD(&ailp->xa_ail);
+       spin_lock_init(&ailp->xa_lock);
+       error = xfsaild_start(ailp);
+       if (error)
+               goto out_free_ailp;
+       mp->m_ail = ailp;
+       return 0;
+
+out_free_ailp:
+       kmem_free(ailp);
+       return error;
 }
 
 void
 xfs_trans_ail_destroy(
        xfs_mount_t     *mp)
 {
-       xfsaild_stop(mp);
+       struct xfs_ail  *ailp = mp->m_ail;
+
+       xfsaild_stop(ailp);
+       kmem_free(ailp);
 }
 
 /*
@@ -534,7 +610,7 @@ xfs_trans_ail_destroy(
  */
 STATIC void
 xfs_ail_insert(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
@@ -568,7 +644,7 @@ xfs_ail_insert(
 /*ARGSUSED*/
 STATIC xfs_log_item_t *
 xfs_ail_delete(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
@@ -585,7 +661,7 @@ xfs_ail_delete(
  */
 STATIC xfs_log_item_t *
 xfs_ail_min(
-       xfs_ail_t       *ailp)
+       struct xfs_ail  *ailp)
 /* ARGSUSED */
 {
        if (list_empty(&ailp->xa_ail))
@@ -601,7 +677,7 @@ xfs_ail_min(
  */
 STATIC xfs_log_item_t *
 xfs_ail_next(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 /* ARGSUSED */
 {
@@ -617,7 +693,7 @@ xfs_ail_next(
  */
 STATIC void
 xfs_ail_check(
-       xfs_ail_t       *ailp,
+       struct xfs_ail  *ailp,
        xfs_log_item_t  *lip)
 {
        xfs_log_item_t  *prev_lip;
index 4e855b5ced666eefeb5b910e0599adeed3a0ec17..8ee2f8c8b0a61863c9686a7bbae9b4794616615f 100644 (file)
@@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t        *tp,
                        lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
                        if (lip->li_type == XFS_LI_BUF) {
                                bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
-                               xfs_trans_unlocked_item(
-                                               bip->bli_item.li_mountp,
-                                               lip);
+                               xfs_trans_unlocked_item(bip->bli_item.li_ailp,
+                                                       lip);
                        }
                }
                xfs_buf_relse(bp);
@@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t        *tp,
         * tell the AIL that the buffer is being unlocked.
         */
        if (bip != NULL) {
-               xfs_trans_unlocked_item(bip->bli_item.li_mountp,
+               xfs_trans_unlocked_item(bip->bli_item.li_ailp,
                                        (xfs_log_item_t*)bip);
        }
 
index 3c666e8317f82bd3f0e41451a06bd39d3c9ff8db..e110bf57d7f496ddd70ce9f4927da8d67fa20545 100644 (file)
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
+/* XXX: from here down needed until struct xfs_trans has it's own ailp */
+#include "xfs_bit.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
 
 STATIC int     xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
                                        int, int, xfs_lsn_t);
@@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
                lidp->lid_size = 0;
                lip->li_desc = lidp;
                lip->li_mountp = tp->t_mountp;
+               lip->li_ailp = tp->t_mountp->m_ail;
                return lidp;
        }
 
@@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
        lidp->lid_size = 0;
        lip->li_desc = lidp;
        lip->li_mountp = tp->t_mountp;
+       lip->li_ailp = tp->t_mountp->m_ail;
        return lidp;
 }
 
index 3c748c456ed4f9a58c03babd86d9afd401ca8efe..73e2ad3974328ed9a62a81663feab3aacd0f730d 100644 (file)
@@ -44,25 +44,93 @@ xfs_log_busy_slot_t         *xfs_trans_add_busy(xfs_trans_t *tp,
                                                    xfs_extlen_t idx);
 
 /*
- * From xfs_trans_ail.c
+ * AIL traversal cursor.
+ *
+ * Rather than using a generation number for detecting changes in the ail, use
+ * a cursor that is protected by the ail lock. The aild cursor exists in the
+ * struct xfs_ail, but other traversals can declare it on the stack and link it
+ * to the ail list.
+ *
+ * When an object is deleted from or moved int the AIL, the cursor list is
+ * searched to see if the object is a designated cursor item. If it is, it is
+ * deleted from the cursor so that the next time the cursor is used traversal
+ * will return to the start.
+ *
+ * This means a traversal colliding with a removal will cause a restart of the
+ * list scan, rather than any insertion or deletion anywhere in the list. The
+ * low bit of the item pointer is set if the cursor has been invalidated so
+ * that we can tell the difference between invalidation and reaching the end
+ * of the list to trigger traversal restarts.
  */
-void                   xfs_trans_update_ail(struct xfs_mount *mp,
-                                    struct xfs_log_item *lip, xfs_lsn_t lsn)
-                                    __releases(mp->m_ail_lock);
-void                   xfs_trans_delete_ail(struct xfs_mount *mp,
-                                    struct xfs_log_item *lip)
-                                    __releases(mp->m_ail_lock);
-struct xfs_log_item    *xfs_trans_first_ail(struct xfs_mount *, int *);
-struct xfs_log_item    *xfs_trans_next_ail(struct xfs_mount *,
-                                    struct xfs_log_item *, int *, int *);
+struct xfs_ail_cursor {
+       struct xfs_ail_cursor   *next;
+       struct xfs_log_item     *item;
+};
 
+/*
+ * Private AIL structures.
+ *
+ * Eventually we need to drive the locking in here as well.
+ */
+struct xfs_ail {
+       struct xfs_mount        *xa_mount;
+       struct list_head        xa_ail;
+       uint                    xa_gen;
+       struct task_struct      *xa_task;
+       xfs_lsn_t               xa_target;
+       struct xfs_ail_cursor   xa_cursors;
+       spinlock_t              xa_lock;
+};
 
 /*
- * AIL push thread support
+ * From xfs_trans_ail.c
  */
-long   xfsaild_push(struct xfs_mount *, xfs_lsn_t *);
-void   xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t);
-int    xfsaild_start(struct xfs_mount *);
-void   xfsaild_stop(struct xfs_mount *);
+void                   xfs_trans_ail_update(struct xfs_ail *ailp,
+                                       struct xfs_log_item *lip, xfs_lsn_t lsn)
+                                       __releases(ailp->xa_lock);
+void                   xfs_trans_ail_delete(struct xfs_ail *ailp,
+                                       struct xfs_log_item *lip)
+                                       __releases(ailp->xa_lock);
+void                   xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
+void                   xfs_trans_unlocked_item(struct xfs_ail *,
+                                       xfs_log_item_t *);
+
+xfs_lsn_t              xfs_trans_ail_tail(struct xfs_ail *ailp);
+
+struct xfs_log_item    *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+                                       struct xfs_ail_cursor *cur,
+                                       xfs_lsn_t lsn);
+struct xfs_log_item    *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+                                       struct xfs_ail_cursor *cur);
+void                   xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
+                                       struct xfs_ail_cursor *cur);
+
+long   xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
+void   xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
+int    xfsaild_start(struct xfs_ail *);
+void   xfsaild_stop(struct xfs_ail *);
 
+#if BITS_PER_LONG != 64
+static inline void
+xfs_trans_ail_copy_lsn(
+       struct xfs_ail  *ailp,
+       xfs_lsn_t       *dst,
+       xfs_lsn_t       *src)
+{
+       ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
+       spin_lock(&ailp->xa_lock);
+       *dst = *src;
+       spin_unlock(&ailp->xa_lock);
+}
+#else
+static inline void
+xfs_trans_ail_copy_lsn(
+       struct xfs_ail  *ailp,
+       xfs_lsn_t       *dst,
+       xfs_lsn_t       *src)
+{
+       ASSERT(sizeof(xfs_lsn_t) == 8);
+       *dst = *src;
+}
+#endif
 #endif /* __XFS_TRANS_PRIV_H__ */
index 439dd3939dda043d172c5a9b0285085afc042534..305d9f3948e03e773e7a53a88556b28b7fe1f8ed 100644 (file)
 #include "xfs_extfree_item.h"
 #include "xfs_acl.h"
 #include "xfs_attr.h"
-#include "xfs_clnt.h"
 #include "xfs_mru_cache.h"
 #include "xfs_filestream.h"
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
 #include "xfs_vfsops.h"
 #include "xfs_utils.h"
+#include "xfs_sync.h"
 
 
-STATIC void
-xfs_quiesce_fs(
-       xfs_mount_t             *mp)
-{
-       int                     count = 0, pincount;
-
-       xfs_flush_buftarg(mp->m_ddev_targp, 0);
-       xfs_finish_reclaim_all(mp, 0);
-
-       /* This loop must run at least twice.
-        * The first instance of the loop will flush
-        * most meta data but that will generate more
-        * meta data (typically directory updates).
-        * Which then must be flushed and logged before
-        * we can write the unmount record.
-        */
-       do {
-               xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
-               pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
-               if (!pincount) {
-                       delay(50);
-                       count++;
-               }
-       } while (count < 2);
-}
-
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceding.
- */
-void
-xfs_attr_quiesce(
-       xfs_mount_t     *mp)
-{
-       int     error = 0;
-
-       /* wait for all modifications to complete */
-       while (atomic_read(&mp->m_active_trans) > 0)
-               delay(100);
-
-       /* flush inodes and push all remaining buffers out to disk */
-       xfs_quiesce_fs(mp);
-
-       ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
-
-       /* Push the superblock and write an unmount record */
-       error = xfs_log_sbcount(mp, 1);
-       if (error)
-               xfs_fs_cmn_err(CE_WARN, mp,
-                               "xfs_attr_quiesce: failed to log sb changes. "
-                               "Frozen image may not be consistent.");
-       xfs_log_unmount_write(mp);
-       xfs_unmountfs_writesb(mp);
-}
-
 /*
  * xfs_unmount_flush implements a set of flush operation on special
  * inodes, which are needed as a separate set of operations so that
@@ -196,562 +140,3 @@ fscorrupt_out2:
        return XFS_ERROR(EFSCORRUPTED);
 }
 
-/*
- * xfs_sync flushes any pending I/O to file system vfsp.
- *
- * This routine is called by vfs_sync() to make sure that things make it
- * out to disk eventually, on sync() system calls to flush out everything,
- * and when the file system is unmounted.  For the vfs_sync() case, all
- * we really need to do is sync out the log to make all of our meta-data
- * updates permanent (except for timestamps).  For calls from pflushd(),
- * dirty pages are kept moving by calling pdflush() on the inodes
- * containing them.  We also flush the inodes that we can lock without
- * sleeping and the superblock if we can lock it without sleeping from
- * vfs_sync() so that items at the tail of the log are always moving out.
- *
- * Flags:
- *      SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
- *                    to sleep if we can help it.  All we really need
- *                    to do is ensure that the log is synced at least
- *                    periodically.  We also push the inodes and
- *                    superblock if we can lock them without sleeping
- *                     and they are not pinned.
- *      SYNC_ATTR    - We need to flush the inodes.  If SYNC_BDFLUSH is not
- *                    set, then we really want to lock each inode and flush
- *                    it.
- *      SYNC_WAIT    - All the flushes that take place in this call should
- *                    be synchronous.
- *      SYNC_DELWRI  - This tells us to push dirty pages associated with
- *                    inodes.  SYNC_WAIT and SYNC_BDFLUSH are used to
- *                    determine if they should be flushed sync, async, or
- *                    delwri.
- *      SYNC_CLOSE   - This flag is passed when the system is being
- *                    unmounted.  We should sync and invalidate everything.
- *      SYNC_FSDATA  - This indicates that the caller would like to make
- *                    sure the superblock is safe on disk.  We can ensure
- *                    this by simply making sure the log gets flushed
- *                    if SYNC_BDFLUSH is set, and by actually writing it
- *                    out otherwise.
- *     SYNC_IOWAIT  - The caller wants us to wait for all data I/O to complete
- *                    before we return (including direct I/O). Forms the drain
- *                    side of the write barrier needed to safely quiesce the
- *                    filesystem.
- *
- */
-int
-xfs_sync(
-       xfs_mount_t     *mp,
-       int             flags)
-{
-       int             error;
-
-       /*
-        * Get the Quota Manager to flush the dquots.
-        *
-        * If XFS quota support is not enabled or this filesystem
-        * instance does not use quotas XFS_QM_DQSYNC will always
-        * return zero.
-        */
-       error = XFS_QM_DQSYNC(mp, flags);
-       if (error) {
-               /*
-                * If we got an IO error, we will be shutting down.
-                * So, there's nothing more for us to do here.
-                */
-               ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
-               if (XFS_FORCED_SHUTDOWN(mp))
-                       return XFS_ERROR(error);
-       }
-
-       if (flags & SYNC_IOWAIT)
-               xfs_filestream_flush(mp);
-
-       return xfs_syncsub(mp, flags, NULL);
-}
-
-/*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
- */
-int
-xfs_sync_inodes(
-       xfs_mount_t     *mp,
-       int             flags,
-       int             *bypassed)
-{
-       xfs_inode_t     *ip = NULL;
-       struct inode    *vp = NULL;
-       int             error;
-       int             last_error;
-       uint64_t        fflag;
-       uint            lock_flags;
-       uint            base_lock_flags;
-       boolean_t       mount_locked;
-       boolean_t       vnode_refed;
-       int             preempt;
-       xfs_iptr_t      *ipointer;
-#ifdef DEBUG
-       boolean_t       ipointer_in = B_FALSE;
-
-#define IPOINTER_SET   ipointer_in = B_TRUE
-#define IPOINTER_CLR   ipointer_in = B_FALSE
-#else
-#define IPOINTER_SET
-#define IPOINTER_CLR
-#endif
-
-
-/* Insert a marker record into the inode list after inode ip. The list
- * must be locked when this is called. After the call the list will no
- * longer be locked.
- */
-#define IPOINTER_INSERT(ip, mp)        { \
-               ASSERT(ipointer_in == B_FALSE); \
-               ipointer->ip_mnext = ip->i_mnext; \
-               ipointer->ip_mprev = ip; \
-               ip->i_mnext = (xfs_inode_t *)ipointer; \
-               ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
-               preempt = 0; \
-               XFS_MOUNT_IUNLOCK(mp); \
-               mount_locked = B_FALSE; \
-               IPOINTER_SET; \
-       }
-
-/* Remove the marker from the inode list. If the marker was the only item
- * in the list then there are no remaining inodes and we should zero out
- * the whole list. If we are the current head of the list then move the head
- * past us.
- */
-#define IPOINTER_REMOVE(ip, mp)        { \
-               ASSERT(ipointer_in == B_TRUE); \
-               if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
-                       ip = ipointer->ip_mnext; \
-                       ip->i_mprev = ipointer->ip_mprev; \
-                       ipointer->ip_mprev->i_mnext = ip; \
-                       if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
-                               mp->m_inodes = ip; \
-                       } \
-               } else { \
-                       ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
-                       mp->m_inodes = NULL; \
-                       ip = NULL; \
-               } \
-               IPOINTER_CLR; \
-       }
-
-#define XFS_PREEMPT_MASK       0x7f
-
-       ASSERT(!(flags & SYNC_BDFLUSH));
-
-       if (bypassed)
-               *bypassed = 0;
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return 0;
-       error = 0;
-       last_error = 0;
-       preempt = 0;
-
-       /* Allocate a reference marker */
-       ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
-
-       fflag = XFS_B_ASYNC;            /* default is don't wait */
-       if (flags & SYNC_DELWRI)
-               fflag = XFS_B_DELWRI;
-       if (flags & SYNC_WAIT)
-               fflag = 0;              /* synchronous overrides all */
-
-       base_lock_flags = XFS_ILOCK_SHARED;
-       if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
-               /*
-                * We need the I/O lock if we're going to call any of
-                * the flush/inval routines.
-                */
-               base_lock_flags |= XFS_IOLOCK_SHARED;
-       }
-
-       XFS_MOUNT_ILOCK(mp);
-
-       ip = mp->m_inodes;
-
-       mount_locked = B_TRUE;
-       vnode_refed  = B_FALSE;
-
-       IPOINTER_CLR;
-
-       do {
-               ASSERT(ipointer_in == B_FALSE);
-               ASSERT(vnode_refed == B_FALSE);
-
-               lock_flags = base_lock_flags;
-
-               /*
-                * There were no inodes in the list, just break out
-                * of the loop.
-                */
-               if (ip == NULL) {
-                       break;
-               }
-
-               /*
-                * We found another sync thread marker - skip it
-                */
-               if (ip->i_mount == NULL) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
-
-               vp = VFS_I(ip);
-
-               /*
-                * If the vnode is gone then this is being torn down,
-                * call reclaim if it is flushed, else let regular flush
-                * code deal with it later in the loop.
-                */
-
-               if (vp == NULL) {
-                       /* Skip ones already in reclaim */
-                       if (ip->i_flags & XFS_IRECLAIM) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
-                       if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-                               ip = ip->i_mnext;
-                       } else if ((xfs_ipincount(ip) == 0) &&
-                                   xfs_iflock_nowait(ip)) {
-                               IPOINTER_INSERT(ip, mp);
-
-                               xfs_finish_reclaim(ip, 1,
-                                               XFS_IFLUSH_DELWRI_ELSE_ASYNC);
-
-                               XFS_MOUNT_ILOCK(mp);
-                               mount_locked = B_TRUE;
-                               IPOINTER_REMOVE(ip, mp);
-                       } else {
-                               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                               ip = ip->i_mnext;
-                       }
-                       continue;
-               }
-
-               if (VN_BAD(vp)) {
-                       ip = ip->i_mnext;
-                       continue;
-               }
-
-               if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
-                       XFS_MOUNT_IUNLOCK(mp);
-                       kmem_free(ipointer);
-                       return 0;
-               }
-
-               /*
-                * Try to lock without sleeping.  We're out of order with
-                * the inode list lock here, so if we fail we need to drop
-                * the mount lock and try again.  If we're called from
-                * bdflush() here, then don't bother.
-                *
-                * The inode lock here actually coordinates with the
-                * almost spurious inode lock in xfs_ireclaim() to prevent
-                * the vnode we handle here without a reference from
-                * being freed while we reference it.  If we lock the inode
-                * while it's on the mount list here, then the spurious inode
-                * lock in xfs_ireclaim() after the inode is pulled from
-                * the mount list will sleep until we release it here.
-                * This keeps the vnode from being freed while we reference
-                * it.
-                */
-               if (xfs_ilock_nowait(ip, lock_flags) == 0) {
-                       if (vp == NULL) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
-
-                       vp = vn_grab(vp);
-                       if (vp == NULL) {
-                               ip = ip->i_mnext;
-                               continue;
-                       }
-
-                       IPOINTER_INSERT(ip, mp);
-                       xfs_ilock(ip, lock_flags);
-
-                       ASSERT(vp == VFS_I(ip));
-                       ASSERT(ip->i_mount == mp);
-
-                       vnode_refed = B_TRUE;
-               }
-
-               /* From here on in the loop we may have a marker record
-                * in the inode list.
-                */
-
-               /*
-                * If we have to flush data or wait for I/O completion
-                * we need to drop the ilock that we currently hold.
-                * If we need to drop the lock, insert a marker if we
-                * have not already done so.
-                */
-               if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
-                   ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
-                       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-                       if (flags & SYNC_CLOSE) {
-                               /* Shutdown case. Flush and invalidate. */
-                               if (XFS_FORCED_SHUTDOWN(mp))
-                                       xfs_tosspages(ip, 0, -1,
-                                                            FI_REMAPF);
-                               else
-                                       error = xfs_flushinval_pages(ip,
-                                                       0, -1, FI_REMAPF);
-                       } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
-                               error = xfs_flush_pages(ip, 0,
-                                                       -1, fflag, FI_NONE);
-                       }
-
-                       /*
-                        * When freezing, we need to wait ensure all I/O (including direct
-                        * I/O) is complete to ensure no further data modification can take
-                        * place after this point
-                        */
-                       if (flags & SYNC_IOWAIT)
-                               vn_iowait(ip);
-
-                       xfs_ilock(ip, XFS_ILOCK_SHARED);
-               }
-
-               if ((flags & SYNC_ATTR) &&
-                   (ip->i_update_core ||
-                    (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
-                       if (mount_locked)
-                               IPOINTER_INSERT(ip, mp);
-
-                       if (flags & SYNC_WAIT) {
-                               xfs_iflock(ip);
-                               error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-
-                       /*
-                        * If we can't acquire the flush lock, then the inode
-                        * is already being flushed so don't bother waiting.
-                        *
-                        * If we can lock it then do a delwri flush so we can
-                        * combine multiple inode flushes in each disk write.
-                        */
-                       } else if (xfs_iflock_nowait(ip)) {
-                               error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
-                       } else if (bypassed) {
-                               (*bypassed)++;
-                       }
-               }
-
-               if (lock_flags != 0) {
-                       xfs_iunlock(ip, lock_flags);
-               }
-
-               if (vnode_refed) {
-                       /*
-                        * If we had to take a reference on the vnode
-                        * above, then wait until after we've unlocked
-                        * the inode to release the reference.  This is
-                        * because we can be already holding the inode
-                        * lock when IRELE() calls xfs_inactive().
-                        *
-                        * Make sure to drop the mount lock before calling
-                        * IRELE() so that we don't trip over ourselves if
-                        * we have to go for the mount lock again in the
-                        * inactive code.
-                        */
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
-
-                       IRELE(ip);
-
-                       vnode_refed = B_FALSE;
-               }
-
-               if (error) {
-                       last_error = error;
-               }
-
-               /*
-                * bail out if the filesystem is corrupted.
-                */
-               if (error == EFSCORRUPTED)  {
-                       if (!mount_locked) {
-                               XFS_MOUNT_ILOCK(mp);
-                               IPOINTER_REMOVE(ip, mp);
-                       }
-                       XFS_MOUNT_IUNLOCK(mp);
-                       ASSERT(ipointer_in == B_FALSE);
-                       kmem_free(ipointer);
-                       return XFS_ERROR(error);
-               }
-
-               /* Let other threads have a chance at the mount lock
-                * if we have looped many times without dropping the
-                * lock.
-                */
-               if ((++preempt & XFS_PREEMPT_MASK) == 0) {
-                       if (mount_locked) {
-                               IPOINTER_INSERT(ip, mp);
-                       }
-               }
-
-               if (mount_locked == B_FALSE) {
-                       XFS_MOUNT_ILOCK(mp);
-                       mount_locked = B_TRUE;
-                       IPOINTER_REMOVE(ip, mp);
-                       continue;
-               }
-
-               ASSERT(ipointer_in == B_FALSE);
-               ip = ip->i_mnext;
-
-       } while (ip != mp->m_inodes);
-
-       XFS_MOUNT_IUNLOCK(mp);
-
-       ASSERT(ipointer_in == B_FALSE);
-
-       kmem_free(ipointer);
-       return XFS_ERROR(last_error);
-}
-
-/*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
- */
-int
-xfs_syncsub(
-       xfs_mount_t     *mp,
-       int             flags,
-       int             *bypassed)
-{
-       int             error = 0;
-       int             last_error = 0;
-       uint            log_flags = XFS_LOG_FORCE;
-       xfs_buf_t       *bp;
-       xfs_buf_log_item_t      *bip;
-
-       /*
-        * Sync out the log.  This ensures that the log is periodically
-        * flushed even if there is not enough activity to fill it up.
-        */
-       if (flags & SYNC_WAIT)
-               log_flags |= XFS_LOG_SYNC;
-
-       xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-
-       if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
-               if (flags & SYNC_BDFLUSH)
-                       xfs_finish_reclaim_all(mp, 1);
-               else
-                       error = xfs_sync_inodes(mp, flags, bypassed);
-       }
-
-       /*
-        * Flushing out dirty data above probably generated more
-        * log activity, so if this isn't vfs_sync() then flush
-        * the log again.
-        */
-       if (flags & SYNC_DELWRI) {
-               xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-       }
-
-       if (flags & SYNC_FSDATA) {
-               /*
-                * If this is vfs_sync() then only sync the superblock
-                * if we can lock it without sleeping and it is not pinned.
-                */
-               if (flags & SYNC_BDFLUSH) {
-                       bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
-                       if (bp != NULL) {
-                               bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
-                               if ((bip != NULL) &&
-                                   xfs_buf_item_dirty(bip)) {
-                                       if (!(XFS_BUF_ISPINNED(bp))) {
-                                               XFS_BUF_ASYNC(bp);
-                                               error = xfs_bwrite(mp, bp);
-                                       } else {
-                                               xfs_buf_relse(bp);
-                                       }
-                               } else {
-                                       xfs_buf_relse(bp);
-                               }
-                       }
-               } else {
-                       bp = xfs_getsb(mp, 0);
-                       /*
-                        * If the buffer is pinned then push on the log so
-                        * we won't get stuck waiting in the write for
-                        * someone, maybe ourselves, to flush the log.
-                        * Even though we just pushed the log above, we
-                        * did not have the superblock buffer locked at
-                        * that point so it can become pinned in between
-                        * there and here.
-                        */
-                       if (XFS_BUF_ISPINNED(bp))
-                               xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
-                       if (flags & SYNC_WAIT)
-                               XFS_BUF_UNASYNC(bp);
-                       else
-                               XFS_BUF_ASYNC(bp);
-                       error = xfs_bwrite(mp, bp);
-               }
-               if (error) {
-                       last_error = error;
-               }
-       }
-
-       /*
-        * Now check to see if the log needs a "dummy" transaction.
-        */
-       if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
-               xfs_trans_t *tp;
-               xfs_inode_t *ip;
-
-               /*
-                * Put a dummy transaction in the log to tell
-                * recovery that all others are OK.
-                */
-               tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-               if ((error = xfs_trans_reserve(tp, 0,
-                               XFS_ICHANGE_LOG_RES(mp),
-                               0, 0, 0)))  {
-                       xfs_trans_cancel(tp, 0);
-                       return error;
-               }
-
-               ip = mp->m_rootip;
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-               xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-               xfs_trans_ihold(tp, ip);
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               error = xfs_trans_commit(tp, 0);
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-       }
-
-       /*
-        * When shutting down, we need to insure that the AIL is pushed
-        * to disk or the filesystem can appear corrupt from the PROM.
-        */
-       if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
-               XFS_bflush(mp->m_ddev_targp);
-               if (mp->m_rtdev_targp) {
-                       XFS_bflush(mp->m_rtdev_targp);
-               }
-       }
-
-       return XFS_ERROR(last_error);
-}
index a74b05087da480244620fdd512102eee02976c7c..6b8e0b52b95e8db3ff481784a9c345389a4e6566 100644 (file)
@@ -8,9 +8,7 @@ struct kstatfs;
 struct xfs_mount;
 struct xfs_mount_args;
 
-int xfs_sync(struct xfs_mount *mp, int flags);
 void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
                int lnnum);
-void xfs_attr_quiesce(struct xfs_mount *mp);
 
 #endif /* _XFS_VFSOPS_H */
index 8b6812f66a15bf2895aba5fed5f04be0c22625d8..c45ea278ef414b79c292e180ff8b6a8233435010 100644 (file)
@@ -79,8 +79,7 @@ int
 xfs_setattr(
        struct xfs_inode        *ip,
        struct iattr            *iattr,
-       int                     flags,
-       cred_t                  *credp)
+       int                     flags)
 {
        xfs_mount_t             *mp = ip->i_mount;
        struct inode            *inode = VFS_I(ip);
@@ -233,10 +232,6 @@ xfs_setattr(
 
        /*
         * Change file ownership.  Must be the owner or privileged.
-        * If the system was configured with the "restricted_chown"
-        * option, the owner is not permitted to give away the file,
-        * and can change the group id only to a group of which he
-        * or she is a member.
         */
        if (mask & (ATTR_UID|ATTR_GID)) {
                /*
@@ -260,9 +255,8 @@ xfs_setattr(
                 * shall be equal to either the group ID or one of the
                 * supplementary group IDs of the calling process.
                 */
-               if (restricted_chown &&
-                   (iuid != uid || (igid != gid &&
-                                    !in_group_p((gid_t)gid))) &&
+               if ((iuid != uid ||
+                    (igid != gid && !in_group_p((gid_t)gid))) &&
                    !capable(CAP_CHOWN)) {
                        code = XFS_ERROR(EPERM);
                        goto error_return;
@@ -456,10 +450,6 @@ xfs_setattr(
 
        /*
         * Change file ownership.  Must be the owner or privileged.
-        * If the system was configured with the "restricted_chown"
-        * option, the owner is not permitted to give away the file,
-        * and can change the group id only to a group of which he
-        * or she is a member.
         */
        if (mask & (ATTR_UID|ATTR_GID)) {
                /*
@@ -2009,7 +1999,7 @@ xfs_remove(
                        goto out_bmap_cancel;
 
                /*
-                * Drop the link from dp to ip.
+                * Drop the "." link from ip to self.
                 */
                error = xfs_droplink(tp, ip);
                if (error)
@@ -2024,7 +2014,7 @@ xfs_remove(
        }
 
        /*
-        * Drop the "." link from ip to self.
+        * Drop the link from dp to ip.
         */
        error = xfs_droplink(tp, ip);
        if (error)
@@ -2833,122 +2823,10 @@ xfs_reclaim(
        if (!ip->i_update_core && (ip->i_itemp == NULL)) {
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_iflock(ip);
-               return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
-       } else {
-               xfs_mount_t     *mp = ip->i_mount;
-
-               /* Protect sync and unpin from us */
-               XFS_MOUNT_ILOCK(mp);
-               spin_lock(&ip->i_flags_lock);
-               __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-               VFS_I(ip)->i_private = NULL;
-               ip->i_vnode = NULL;
-               spin_unlock(&ip->i_flags_lock);
-               list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
-               XFS_MOUNT_IUNLOCK(mp);
+               xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+               return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
        }
-       return 0;
-}
-
-int
-xfs_finish_reclaim(
-       xfs_inode_t     *ip,
-       int             locked,
-       int             sync_mode)
-{
-       xfs_perag_t     *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
-       struct inode    *vp = VFS_I(ip);
-
-       if (vp && VN_BAD(vp))
-               goto reclaim;
-
-       /* The hash lock here protects a thread in xfs_iget_core from
-        * racing with us on linking the inode back with a vnode.
-        * Once we have the XFS_IRECLAIM flag set it will not touch
-        * us.
-        */
-       write_lock(&pag->pag_ici_lock);
-       spin_lock(&ip->i_flags_lock);
-       if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
-           (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
-               spin_unlock(&ip->i_flags_lock);
-               write_unlock(&pag->pag_ici_lock);
-               if (locked) {
-                       xfs_ifunlock(ip);
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               }
-               return 1;
-       }
-       __xfs_iflags_set(ip, XFS_IRECLAIM);
-       spin_unlock(&ip->i_flags_lock);
-       write_unlock(&pag->pag_ici_lock);
-       xfs_put_perag(ip->i_mount, pag);
-
-       /*
-        * If the inode is still dirty, then flush it out.  If the inode
-        * is not in the AIL, then it will be OK to flush it delwri as
-        * long as xfs_iflush() does not keep any references to the inode.
-        * We leave that decision up to xfs_iflush() since it has the
-        * knowledge of whether it's OK to simply do a delwri flush of
-        * the inode or whether we need to wait until the inode is
-        * pulled from the AIL.
-        * We get the flush lock regardless, though, just to make sure
-        * we don't free it while it is being flushed.
-        */
-       if (!locked) {
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               xfs_iflock(ip);
-       }
-
-       /*
-        * In the case of a forced shutdown we rely on xfs_iflush() to
-        * wait for the inode to be unpinned before returning an error.
-        */
-       if (xfs_iflush(ip, sync_mode) == 0) {
-               /* synchronize with xfs_iflush_done */
-               xfs_iflock(ip);
-               xfs_ifunlock(ip);
-       }
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- reclaim:
-       xfs_ireclaim(ip);
-       return 0;
-}
-
-int
-xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
-{
-       int             purged;
-       xfs_inode_t     *ip, *n;
-       int             done = 0;
-
-       while (!done) {
-               purged = 0;
-               XFS_MOUNT_ILOCK(mp);
-               list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
-                       if (noblock) {
-                               if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
-                                       continue;
-                               if (xfs_ipincount(ip) ||
-                                   !xfs_iflock_nowait(ip)) {
-                                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                                       continue;
-                               }
-                       }
-                       XFS_MOUNT_IUNLOCK(mp);
-                       if (xfs_finish_reclaim(ip, noblock,
-                                       XFS_IFLUSH_DELWRI_ELSE_ASYNC))
-                               delay(1);
-                       purged = 1;
-                       break;
-               }
-
-               done = !purged;
-       }
-
-       XFS_MOUNT_IUNLOCK(mp);
+       xfs_inode_set_reclaim_tag(ip);
        return 0;
 }
 
@@ -3474,7 +3352,6 @@ xfs_change_file_space(
        int             cmd,
        xfs_flock64_t   *bf,
        xfs_off_t       offset,
-       cred_t          *credp,
        int             attr_flags)
 {
        xfs_mount_t     *mp = ip->i_mount;
@@ -3562,7 +3439,7 @@ xfs_change_file_space(
                iattr.ia_valid = ATTR_SIZE;
                iattr.ia_size = startoffset;
 
-               error = xfs_setattr(ip, &iattr, attr_flags, credp);
+               error = xfs_setattr(ip, &iattr, attr_flags);
 
                if (error)
                        return error;
index e932a96bec54994615819245a277fb7ca4cfe8fb..b1ae8e3f4043b1b88cf771990518c6bb5f251d96 100644 (file)
@@ -15,8 +15,7 @@ struct xfs_iomap;
 
 
 int xfs_open(struct xfs_inode *ip);
-int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
-               struct cred *credp);
+int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
 #define        XFS_ATTR_DMI            0x01    /* invocation from a DMI function */
 #define        XFS_ATTR_NONBLOCK       0x02    /* return EAGAIN if operation would block */
 #define XFS_ATTR_NOLOCK                0x04    /* Don't grab any conflicting locks */
@@ -44,8 +43,7 @@ int xfs_inode_flush(struct xfs_inode *ip, int flags);
 int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
 int xfs_reclaim(struct xfs_inode *ip);
 int xfs_change_file_space(struct xfs_inode *ip, int cmd,
-               xfs_flock64_t *bf, xfs_off_t offset,
-               struct cred *credp, int attr_flags);
+               xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
 int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
                struct xfs_inode *src_ip, struct xfs_inode *target_dp,
                struct xfs_name *target_name, struct xfs_inode *target_ip);
index 0dcdd9458f4bcce56d1a89c566892f5d5097e876..51bd9370d4372f28a0a77207c3bc61fb027b9d08 100644 (file)
@@ -1874,7 +1874,9 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
 
 extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
+extern struct inode * inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void inode_add_to_lists(struct super_block *, struct inode *);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
index c35da23ab8fb025f0fd0a03b54f90df3fe246cad..fafeb48f27c046d0b6586e2797b40397a6a76f19 100644 (file)
@@ -730,7 +730,6 @@ static const struct trans_ctl_table trans_fs_quota_table[] = {
 };
 
 static const struct trans_ctl_table trans_fs_xfs_table[] = {
-       { XFS_RESTRICT_CHOWN,   "restrict_chown" },
        { XFS_SGID_INHERIT,     "irix_sgid_inherit" },
        { XFS_SYMLINK_MODE,     "irix_symlink_mode" },
        { XFS_PANIC_MASK,       "panic_mask" },