fs/xfs/xfs_inode.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include <linux/log2.h>
  19
  20 #include "xfs.h"
  21 #include "xfs_fs.h"
  22 #include "xfs_types.h"
  23 #include "xfs_log.h"
  24 #include "xfs_inum.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_trans_priv.h"
  27 #include "xfs_sb.h"
  28 #include "xfs_ag.h"
  29 #include "xfs_mount.h"
  30 #include "xfs_bmap_btree.h"
  31 #include "xfs_alloc_btree.h"
  32 #include "xfs_ialloc_btree.h"
  33 #include "xfs_attr_sf.h"
  34 #include "xfs_dinode.h"
  35 #include "xfs_inode.h"
  36 #include "xfs_buf_item.h"
  37 #include "xfs_inode_item.h"
  38 #include "xfs_btree.h"
  39 #include "xfs_alloc.h"
  40 #include "xfs_ialloc.h"
  41 #include "xfs_bmap.h"
  42 #include "xfs_error.h"
  43 #include "xfs_utils.h"
  44 #include "xfs_quota.h"
  45 #include "xfs_filestream.h"
  46 #include "xfs_vnodeops.h"
  47 #include "xfs_trace.h"
  48 #include "xfs_icache.h"
  49
  50 kmem_zone_t *xfs_ifork_zone;
  51 kmem_zone_t *xfs_inode_zone;
  52
  53 /*
  54  * Used in xfs_itruncate_extents().  This is the maximum number of extents
  55  * freed from a file in a single transaction.
  56  */
  57 #define XFS_ITRUNC_MAX_EXTENTS  2
  58
  59 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
  60 STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
  61 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
  62 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
  63
  64 /*
  65  * helper function to extract extent size hint from inode
  66  */
  67 xfs_extlen_t
  68 xfs_get_extsz_hint(
  69         struct xfs_inode        *ip)
  70 {
  71         if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
  72                 return ip->i_d.di_extsize;
  73         if (XFS_IS_REALTIME_INODE(ip))
  74                 return ip->i_mount->m_sb.sb_rextsize;
  75         return 0;
  76 }
  77
  78 /*
  79  * This is a wrapper routine around the xfs_ilock() routine used to centralize
  80  * some grungy code.  It is used in places that wish to lock the inode solely
  81  * for reading the extents.  The reason these places can't just call
  82  * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the
  83  * extents from disk for a file in b-tree format.  If the inode is in b-tree
  84  * format, then we need to lock the inode exclusively until the extents are read
  85  * in.  Locking it exclusively all the time would limit our parallelism
  86  * unnecessarily, though.  What we do instead is check to see if the extents
  87  * have been read in yet, and only lock the inode exclusively if they have not.
  88  *
  89  * The function returns a value which should be given to the corresponding
  90  * xfs_iunlock_map_shared().  This value is the mode in which the lock was
  91  * actually taken.
  92  */
  93 uint
  94 xfs_ilock_map_shared(
  95         xfs_inode_t     *ip)
  96 {
  97         uint    lock_mode;
  98
  99         if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
 100             ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
 101                 lock_mode = XFS_ILOCK_EXCL;
 102         } else {
 103                 lock_mode = XFS_ILOCK_SHARED;
 104         }
 105
 106         xfs_ilock(ip, lock_mode);
 107
 108         return lock_mode;
 109 }
 110
 111 /*
 112  * This is simply the unlock routine to go with xfs_ilock_map_shared().
 113  * All it does is call xfs_iunlock() with the given lock_mode.
 114  */
 115 void
 116 xfs_iunlock_map_shared(
 117         xfs_inode_t     *ip,
 118         unsigned int    lock_mode)
 119 {
 120         xfs_iunlock(ip, lock_mode);
 121 }
 122
 123 /*
 124  * The xfs inode contains 2 locks: a multi-reader lock called the
 125  * i_iolock and a multi-reader lock called the i_lock.  This routine
 126  * allows either or both of the locks to be obtained.
 127  *
 128  * The 2 locks should always be ordered so that the IO lock is
 129  * obtained first in order to prevent deadlock.
 130  *
 131  * ip -- the inode being locked
 132  * lock_flags -- this parameter indicates the inode's locks
 133  *       to be locked.  It can be:
 134  *              XFS_IOLOCK_SHARED,
 135  *              XFS_IOLOCK_EXCL,
 136  *              XFS_ILOCK_SHARED,
 137  *              XFS_ILOCK_EXCL,
 138  *              XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 139  *              XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 140  *              XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 141  *              XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 142  */
 143 void
 144 xfs_ilock(
 145         xfs_inode_t             *ip,
 146         uint                    lock_flags)
 147 {
 148         trace_xfs_ilock(ip, lock_flags, _RET_IP_);
 149
 150         /*
 151          * You can't set both SHARED and EXCL for the same lock,
 152          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 153          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 154          */
 155         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 156                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 157         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 158                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 159         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 160
 161         if (lock_flags & XFS_IOLOCK_EXCL)
 162                 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 163         else if (lock_flags & XFS_IOLOCK_SHARED)
 164                 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 165
 166         if (lock_flags & XFS_ILOCK_EXCL)
 167                 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 168         else if (lock_flags & XFS_ILOCK_SHARED)
 169                 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 170 }
 171
 172 /*
 173  * This is just like xfs_ilock(), except that the caller
 174  * is guaranteed not to sleep.  It returns 1 if it gets
 175  * the requested locks and 0 otherwise.  If the IO lock is
 176  * obtained but the inode lock cannot be, then the IO lock
 177  * is dropped before returning.
 178  *
 179  * ip -- the inode being locked
 180  * lock_flags -- this parameter indicates the inode's locks to be
 181  *       to be locked.  See the comment for xfs_ilock() for a list
 182  *       of valid values.
 183  */
 184 int
 185 xfs_ilock_nowait(
 186         xfs_inode_t             *ip,
 187         uint                    lock_flags)
 188 {
 189         trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
 190
 191         /*
 192          * You can't set both SHARED and EXCL for the same lock,
 193          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 194          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 195          */
 196         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 197                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 198         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 199                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 200         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 201
 202         if (lock_flags & XFS_IOLOCK_EXCL) {
 203                 if (!mrtryupdate(&ip->i_iolock))
 204                         goto out;
 205         } else if (lock_flags & XFS_IOLOCK_SHARED) {
 206                 if (!mrtryaccess(&ip->i_iolock))
 207                         goto out;
 208         }
 209         if (lock_flags & XFS_ILOCK_EXCL) {
 210                 if (!mrtryupdate(&ip->i_lock))
 211                         goto out_undo_iolock;
 212         } else if (lock_flags & XFS_ILOCK_SHARED) {
 213                 if (!mrtryaccess(&ip->i_lock))
 214                         goto out_undo_iolock;
 215         }
 216         return 1;
 217
 218  out_undo_iolock:
 219         if (lock_flags & XFS_IOLOCK_EXCL)
 220                 mrunlock_excl(&ip->i_iolock);
 221         else if (lock_flags & XFS_IOLOCK_SHARED)
 222                 mrunlock_shared(&ip->i_iolock);
 223  out:
 224         return 0;
 225 }
 226
 227 /*
 228  * xfs_iunlock() is used to drop the inode locks acquired with
 229  * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 230  * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 231  * that we know which locks to drop.
 232  *
 233  * ip -- the inode being unlocked
 234  * lock_flags -- this parameter indicates the inode's locks to be
 235  *       to be unlocked.  See the comment for xfs_ilock() for a list
 236  *       of valid values for this parameter.
 237  *
 238  */
 239 void
 240 xfs_iunlock(
 241         xfs_inode_t             *ip,
 242         uint                    lock_flags)
 243 {
 244         /*
 245          * You can't set both SHARED and EXCL for the same lock,
 246          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 247          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 248          */
 249         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 250                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 251         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 252                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 253         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 254         ASSERT(lock_flags != 0);
 255
 256         if (lock_flags & XFS_IOLOCK_EXCL)
 257                 mrunlock_excl(&ip->i_iolock);
 258         else if (lock_flags & XFS_IOLOCK_SHARED)
 259                 mrunlock_shared(&ip->i_iolock);
 260
 261         if (lock_flags & XFS_ILOCK_EXCL)
 262                 mrunlock_excl(&ip->i_lock);
 263         else if (lock_flags & XFS_ILOCK_SHARED)
 264                 mrunlock_shared(&ip->i_lock);
 265
 266         trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 267 }
 268
 269 /*
 270  * give up write locks.  the i/o lock cannot be held nested
 271  * if it is being demoted.
 272  */
 273 void
 274 xfs_ilock_demote(
 275         xfs_inode_t             *ip,
 276         uint                    lock_flags)
 277 {
 278         ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
 279         ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 280
 281         if (lock_flags & XFS_ILOCK_EXCL)
 282                 mrdemote(&ip->i_lock);
 283         if (lock_flags & XFS_IOLOCK_EXCL)
 284                 mrdemote(&ip->i_iolock);
 285
 286         trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 287 }
 288
 289 #ifdef DEBUG
 290 int
 291 xfs_isilocked(
 292         xfs_inode_t             *ip,
 293         uint                    lock_flags)
 294 {
 295         if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
 296                 if (!(lock_flags & XFS_ILOCK_SHARED))
 297                         return !!ip->i_lock.mr_writer;
 298                 return rwsem_is_locked(&ip->i_lock.mr_lock);
 299         }
 300
 301         if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
 302                 if (!(lock_flags & XFS_IOLOCK_SHARED))
 303                         return !!ip->i_iolock.mr_writer;
 304                 return rwsem_is_locked(&ip->i_iolock.mr_lock);
 305         }
 306
 307         ASSERT(0);
 308         return 0;
 309 }
 310 #endif
 311
 312 void
 313 __xfs_iflock(
 314         struct xfs_inode        *ip)
 315 {
 316         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
 317         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
 318
 319         do {
 320                 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 321                 if (xfs_isiflocked(ip))
 322                         io_schedule();
 323         } while (!xfs_iflock_nowait(ip));
 324
 325         finish_wait(wq, &wait.wait);
 326 }
 327
 328 #ifdef DEBUG
 329 /*
 330  * Make sure that the extents in the given memory buffer
 331  * are valid.
 332  */
 333 STATIC void
 334 xfs_validate_extents(
 335         xfs_ifork_t             *ifp,
 336         int                     nrecs,
 337         xfs_exntfmt_t           fmt)
 338 {
 339         xfs_bmbt_irec_t         irec;
 340         xfs_bmbt_rec_host_t     rec;
 341         int                     i;
 342
 343         for (i = 0; i < nrecs; i++) {
 344                 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 345                 rec.l0 = get_unaligned(&ep->l0);
 346                 rec.l1 = get_unaligned(&ep->l1);
 347                 xfs_bmbt_get_all(&rec, &irec);
 348                 if (fmt == XFS_EXTFMT_NOSTATE)
 349                         ASSERT(irec.br_state == XFS_EXT_NORM);
 350         }
 351 }
 352 #else /* DEBUG */
 353 #define xfs_validate_extents(ifp, nrecs, fmt)
 354 #endif /* DEBUG */
 355
 356 /*
 357  * Check that none of the inode's in the buffer have a next
 358  * unlinked field of 0.
 359  */
 360 #if defined(DEBUG)
 361 void
 362 xfs_inobp_check(
 363         xfs_mount_t     *mp,
 364         xfs_buf_t       *bp)
 365 {
 366         int             i;
 367         int             j;
 368         xfs_dinode_t    *dip;
 369
 370         j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
 371
 372         for (i = 0; i < j; i++) {
 373                 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
 374                                         i * mp->m_sb.sb_inodesize);
 375                 if (!dip->di_next_unlinked)  {
 376                         xfs_alert(mp,
 377         "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
 378                                 bp);
 379                         ASSERT(dip->di_next_unlinked);
 380                 }
 381         }
 382 }
 383 #endif
 384
 385 static void
 386 xfs_inode_buf_verify(
 387         struct xfs_buf  *bp)
 388 {
 389         struct xfs_mount *mp = bp->b_target->bt_mount;
 390         int             i;
 391         int             ni;
 392
 393         /*
 394          * Validate the magic number and version of every inode in the buffer
 395          */
 396         ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
 397         for (i = 0; i < ni; i++) {
 398                 int             di_ok;
 399                 xfs_dinode_t    *dip;
 400
 401                 dip = (struct xfs_dinode *)xfs_buf_offset(bp,
 402                                         (i << mp->m_sb.sb_inodelog));
 403                 di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
 404                             XFS_DINODE_GOOD_VERSION(dip->di_version);
 405                 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
 406                                                 XFS_ERRTAG_ITOBP_INOTOBP,
 407                                                 XFS_RANDOM_ITOBP_INOTOBP))) {
 408                         xfs_buf_ioerror(bp, EFSCORRUPTED);
 409                         XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
 410                                              mp, dip);
 411 #ifdef DEBUG
 412                         xfs_emerg(mp,
 413                                 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
 414                                 (unsigned long long)bp->b_bn, i,
 415                                 be16_to_cpu(dip->di_magic));
 416                         ASSERT(0);
 417 #endif
 418                 }
 419         }
 420         xfs_inobp_check(mp, bp);
 421 }
 422
 423 void
 424 xfs_inode_buf_write_verify(
 425         struct xfs_buf  *bp)
 426 {
 427         xfs_inode_buf_verify(bp);
 428 }
 429
 430 void
 431 xfs_inode_buf_read_verify(
 432         struct xfs_buf  *bp)
 433 {
 434         xfs_inode_buf_verify(bp);
 435         bp->b_pre_io = xfs_inode_buf_write_verify;
 436         bp->b_iodone = NULL;
 437         xfs_buf_ioend(bp, 0);
 438 }
 439
 440 /*
 441  * This routine is called to map an inode to the buffer containing the on-disk
 442  * version of the inode.  It returns a pointer to the buffer containing the
 443  * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 444  * pointer to the on-disk inode within that buffer.
 445  *
 446  * If a non-zero error is returned, then the contents of bpp and dipp are
 447  * undefined.
 448  */
 449 int
 450 xfs_imap_to_bp(
 451         struct xfs_mount        *mp,
 452         struct xfs_trans        *tp,
 453         struct xfs_imap         *imap,
 454         struct xfs_dinode       **dipp,
 455         struct xfs_buf          **bpp,
 456         uint                    buf_flags,
 457         uint                    iget_flags)
 458 {
 459         struct xfs_buf          *bp;
 460         int                     error;
 461
 462         buf_flags |= XBF_UNMAPPED;
 463         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 464                                    (int)imap->im_len, buf_flags, &bp,
 465                                    xfs_inode_buf_read_verify);
 466         if (error) {
 467                 if (error == EAGAIN) {
 468                         ASSERT(buf_flags & XBF_TRYLOCK);
 469                         return error;
 470                 }
 471
 472                 if (error == EFSCORRUPTED &&
 473                     (iget_flags & XFS_IGET_UNTRUSTED))
 474                         return XFS_ERROR(EINVAL);
 475
 476                 xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
 477                         __func__, error);
 478                 return error;
 479         }
 480
 481         *bpp = bp;
 482         *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
 483         return 0;
 484 }
 485
 486 /*
 487  * Move inode type and inode format specific information from the
 488  * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 489  * this means set if_rdev to the proper value.  For files, directories,
 490  * and symlinks this means to bring in the in-line data or extent
 491  * pointers.  For a file in B-tree format, only the root is immediately
 492  * brought in-core.  The rest will be in-lined in if_extents when it
 493  * is first referenced (see xfs_iread_extents()).
 494  */
 495 STATIC int
 496 xfs_iformat(
 497         xfs_inode_t             *ip,
 498         xfs_dinode_t            *dip)
 499 {
 500         xfs_attr_shortform_t    *atp;
 501         int                     size;
 502         int                     error = 0;
 503         xfs_fsize_t             di_size;
 504
 505         if (unlikely(be32_to_cpu(dip->di_nextents) +
 506                      be16_to_cpu(dip->di_anextents) >
 507                      be64_to_cpu(dip->di_nblocks))) {
 508                 xfs_warn(ip->i_mount,
 509                         "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
 510                         (unsigned long long)ip->i_ino,
 511                         (int)(be32_to_cpu(dip->di_nextents) +
 512                               be16_to_cpu(dip->di_anextents)),
 513                         (unsigned long long)
 514                                 be64_to_cpu(dip->di_nblocks));
 515                 XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 516                                      ip->i_mount, dip);
 517                 return XFS_ERROR(EFSCORRUPTED);
 518         }
 519
 520         if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
 521                 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
 522                         (unsigned long long)ip->i_ino,
 523                         dip->di_forkoff);
 524                 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 525                                      ip->i_mount, dip);
 526                 return XFS_ERROR(EFSCORRUPTED);
 527         }
 528
 529         if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
 530                      !ip->i_mount->m_rtdev_targp)) {
 531                 xfs_warn(ip->i_mount,
 532                         "corrupt dinode %Lu, has realtime flag set.",
 533                         ip->i_ino);
 534                 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
 535                                      XFS_ERRLEVEL_LOW, ip->i_mount, dip);
 536                 return XFS_ERROR(EFSCORRUPTED);
 537         }
 538
 539         switch (ip->i_d.di_mode & S_IFMT) {
 540         case S_IFIFO:
 541         case S_IFCHR:
 542         case S_IFBLK:
 543         case S_IFSOCK:
 544                 if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
 545                         XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 546                                               ip->i_mount, dip);
 547                         return XFS_ERROR(EFSCORRUPTED);
 548                 }
 549                 ip->i_d.di_size = 0;
 550                 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
 551                 break;
 552
 553         case S_IFREG:
 554         case S_IFLNK:
 555         case S_IFDIR:
 556                 switch (dip->di_format) {
 557                 case XFS_DINODE_FMT_LOCAL:
 558                         /*
 559                          * no local regular files yet
 560                          */
 561                         if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
 562                                 xfs_warn(ip->i_mount,
 563                         "corrupt inode %Lu (local format for regular file).",
 564                                         (unsigned long long) ip->i_ino);
 565                                 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 566                                                      XFS_ERRLEVEL_LOW,
 567                                                      ip->i_mount, dip);
 568                                 return XFS_ERROR(EFSCORRUPTED);
 569                         }
 570
 571                         di_size = be64_to_cpu(dip->di_size);
 572                         if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
 573                                 xfs_warn(ip->i_mount,
 574                         "corrupt inode %Lu (bad size %Ld for local inode).",
 575                                         (unsigned long long) ip->i_ino,
 576                                         (long long) di_size);
 577                                 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
 578                                                      XFS_ERRLEVEL_LOW,
 579                                                      ip->i_mount, dip);
 580                                 return XFS_ERROR(EFSCORRUPTED);
 581                         }
 582
 583                         size = (int)di_size;
 584                         error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
 585                         break;
 586                 case XFS_DINODE_FMT_EXTENTS:
 587                         error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
 588                         break;
 589                 case XFS_DINODE_FMT_BTREE:
 590                         error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
 591                         break;
 592                 default:
 593                         XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
 594                                          ip->i_mount);
 595                         return XFS_ERROR(EFSCORRUPTED);
 596                 }
 597                 break;
 598
 599         default:
 600                 XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
 601                 return XFS_ERROR(EFSCORRUPTED);
 602         }
 603         if (error) {
 604                 return error;
 605         }
 606         if (!XFS_DFORK_Q(dip))
 607                 return 0;
 608
 609         ASSERT(ip->i_afp == NULL);
 610         ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
 611
 612         switch (dip->di_aformat) {
 613         case XFS_DINODE_FMT_LOCAL:
 614                 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
 615                 size = be16_to_cpu(atp->hdr.totsize);
 616
 617                 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
 618                         xfs_warn(ip->i_mount,
 619                                 "corrupt inode %Lu (bad attr fork size %Ld).",
 620                                 (unsigned long long) ip->i_ino,
 621                                 (long long) size);
 622                         XFS_CORRUPTION_ERROR("xfs_iformat(8)",
 623                                              XFS_ERRLEVEL_LOW,
 624                                              ip->i_mount, dip);
 625                         return XFS_ERROR(EFSCORRUPTED);
 626                 }
 627
 628                 error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
 629                 break;
 630         case XFS_DINODE_FMT_EXTENTS:
 631                 error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
 632                 break;
 633         case XFS_DINODE_FMT_BTREE:
 634                 error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
 635                 break;
 636         default:
 637                 error = XFS_ERROR(EFSCORRUPTED);
 638                 break;
 639         }
 640         if (error) {
 641                 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 642                 ip->i_afp = NULL;
 643                 xfs_idestroy_fork(ip, XFS_DATA_FORK);
 644         }
 645         return error;
 646 }
 647
 648 /*
 649  * The file is in-lined in the on-disk inode.
 650  * If it fits into if_inline_data, then copy
 651  * it there, otherwise allocate a buffer for it
 652  * and copy the data there.  Either way, set
 653  * if_data to point at the data.
 654  * If we allocate a buffer for the data, make
 655  * sure that its size is a multiple of 4 and
 656  * record the real size in i_real_bytes.
 657  */
 658 STATIC int
 659 xfs_iformat_local(
 660         xfs_inode_t     *ip,
 661         xfs_dinode_t    *dip,
 662         int             whichfork,
 663         int             size)
 664 {
 665         xfs_ifork_t     *ifp;
 666         int             real_size;
 667
 668         /*
 669          * If the size is unreasonable, then something
 670          * is wrong and we just bail out rather than crash in
 671          * kmem_alloc() or memcpy() below.
 672          */
 673         if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 674                 xfs_warn(ip->i_mount,
 675         "corrupt inode %Lu (bad size %d for local fork, size = %d).",
 676                         (unsigned long long) ip->i_ino, size,
 677                         XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
 678                 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
 679                                      ip->i_mount, dip);
 680                 return XFS_ERROR(EFSCORRUPTED);
 681         }
 682         ifp = XFS_IFORK_PTR(ip, whichfork);
 683         real_size = 0;
 684         if (size == 0)
 685                 ifp->if_u1.if_data = NULL;
 686         else if (size <= sizeof(ifp->if_u2.if_inline_data))
 687                 ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
 688         else {
 689                 real_size = roundup(size, 4);
 690                 ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
 691         }
 692         ifp->if_bytes = size;
 693         ifp->if_real_bytes = real_size;
 694         if (size)
 695                 memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
 696         ifp->if_flags &= ~XFS_IFEXTENTS;
 697         ifp->if_flags |= XFS_IFINLINE;
 698         return 0;
 699 }
 700
 701 /*
 702  * The file consists of a set of extents all
 703  * of which fit into the on-disk inode.
 704  * If there are few enough extents to fit into
 705  * the if_inline_ext, then copy them there.
 706  * Otherwise allocate a buffer for them and copy
 707  * them into it.  Either way, set if_extents
 708  * to point at the extents.
 709  */
 710 STATIC int
 711 xfs_iformat_extents(
 712         xfs_inode_t     *ip,
 713         xfs_dinode_t    *dip,
 714         int             whichfork)
 715 {
 716         xfs_bmbt_rec_t  *dp;
 717         xfs_ifork_t     *ifp;
 718         int             nex;
 719         int             size;
 720         int             i;
 721
 722         ifp = XFS_IFORK_PTR(ip, whichfork);
 723         nex = XFS_DFORK_NEXTENTS(dip, whichfork);
 724         size = nex * (uint)sizeof(xfs_bmbt_rec_t);
 725
 726         /*
 727          * If the number of extents is unreasonable, then something
 728          * is wrong and we just bail out rather than crash in
 729          * kmem_alloc() or memcpy() below.
 730          */
 731         if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 732                 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
 733                         (unsigned long long) ip->i_ino, nex);
 734                 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 735                                      ip->i_mount, dip);
 736                 return XFS_ERROR(EFSCORRUPTED);
 737         }
 738
 739         ifp->if_real_bytes = 0;
 740         if (nex == 0)
 741                 ifp->if_u1.if_extents = NULL;
 742         else if (nex <= XFS_INLINE_EXTS)
 743                 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
 744         else
 745                 xfs_iext_add(ifp, 0, nex);
 746
 747         ifp->if_bytes = size;
 748         if (size) {
 749                 dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
 750                 xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
 751                 for (i = 0; i < nex; i++, dp++) {
 752                         xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 753                         ep->l0 = get_unaligned_be64(&dp->l0);
 754                         ep->l1 = get_unaligned_be64(&dp->l1);
 755                 }
 756                 XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
 757                 if (whichfork != XFS_DATA_FORK ||
 758                         XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
 759                                 if (unlikely(xfs_check_nostate_extents(
 760                                     ifp, 0, nex))) {
 761                                         XFS_ERROR_REPORT("xfs_iformat_extents(2)",
 762                                                          XFS_ERRLEVEL_LOW,
 763                                                          ip->i_mount);
 764                                         return XFS_ERROR(EFSCORRUPTED);
 765                                 }
 766         }
 767         ifp->if_flags |= XFS_IFEXTENTS;
 768         return 0;
 769 }
 770
 771 /*
 772  * The file has too many extents to fit into
 773  * the inode, so they are in B-tree format.
 774  * Allocate a buffer for the root of the B-tree
 775  * and copy the root into it.  The i_extents
 776  * field will remain NULL until all of the
 777  * extents are read in (when they are needed).
 778  */
 779 STATIC int
 780 xfs_iformat_btree(
 781         xfs_inode_t             *ip,
 782         xfs_dinode_t            *dip,
 783         int                     whichfork)
 784 {
 785         xfs_bmdr_block_t        *dfp;
 786         xfs_ifork_t             *ifp;
 787         /* REFERENCED */
 788         int                     nrecs;
 789         int                     size;
 790
 791         ifp = XFS_IFORK_PTR(ip, whichfork);
 792         dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 793         size = XFS_BMAP_BROOT_SPACE(dfp);
 794         nrecs = be16_to_cpu(dfp->bb_numrecs);
 795
 796         /*
 797          * blow out if -- fork has less extents than can fit in
 798          * fork (fork shouldn't be a btree format), root btree
 799          * block has more records than can fit into the fork,
 800          * or the number of extents is greater than the number of
 801          * blocks.
 802          */
 803         if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
 804                         XFS_IFORK_MAXEXT(ip, whichfork) ||
 805                      XFS_BMDR_SPACE_CALC(nrecs) >
 806                         XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
 807                      XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
 808                 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
 809                         (unsigned long long) ip->i_ino);
 810                 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
 811                                  ip->i_mount, dip);
 812                 return XFS_ERROR(EFSCORRUPTED);
 813         }
 814
 815         ifp->if_broot_bytes = size;
 816         ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
 817         ASSERT(ifp->if_broot != NULL);
 818         /*
 819          * Copy and convert from the on-disk structure
 820          * to the in-memory structure.
 821          */
 822         xfs_bmdr_to_bmbt(ip->i_mount, dfp,
 823                          XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
 824                          ifp->if_broot, size);
 825         ifp->if_flags &= ~XFS_IFEXTENTS;
 826         ifp->if_flags |= XFS_IFBROOT;
 827
 828         return 0;
 829 }
 830
 831 STATIC void
 832 xfs_dinode_from_disk(
 833         xfs_icdinode_t          *to,
 834         xfs_dinode_t            *from)
 835 {
 836         to->di_magic = be16_to_cpu(from->di_magic);
 837         to->di_mode = be16_to_cpu(from->di_mode);
 838         to->di_version = from ->di_version;
 839         to->di_format = from->di_format;
 840         to->di_onlink = be16_to_cpu(from->di_onlink);
 841         to->di_uid = be32_to_cpu(from->di_uid);
 842         to->di_gid = be32_to_cpu(from->di_gid);
 843         to->di_nlink = be32_to_cpu(from->di_nlink);
 844         to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
 845         to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 846         memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 847         to->di_flushiter = be16_to_cpu(from->di_flushiter);
 848         to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
 849         to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
 850         to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
 851         to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
 852         to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
 853         to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
 854         to->di_size = be64_to_cpu(from->di_size);
 855         to->di_nblocks = be64_to_cpu(from->di_nblocks);
 856         to->di_extsize = be32_to_cpu(from->di_extsize);
 857         to->di_nextents = be32_to_cpu(from->di_nextents);
 858         to->di_anextents = be16_to_cpu(from->di_anextents);
 859         to->di_forkoff = from->di_forkoff;
 860         to->di_aformat  = from->di_aformat;
 861         to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
 862         to->di_dmstate  = be16_to_cpu(from->di_dmstate);
 863         to->di_flags    = be16_to_cpu(from->di_flags);
 864         to->di_gen      = be32_to_cpu(from->di_gen);
 865 }
 866
 867 void
 868 xfs_dinode_to_disk(
 869         xfs_dinode_t            *to,
 870         xfs_icdinode_t          *from)
 871 {
 872         to->di_magic = cpu_to_be16(from->di_magic);
 873         to->di_mode = cpu_to_be16(from->di_mode);
 874         to->di_version = from ->di_version;
 875         to->di_format = from->di_format;
 876         to->di_onlink = cpu_to_be16(from->di_onlink);
 877         to->di_uid = cpu_to_be32(from->di_uid);
 878         to->di_gid = cpu_to_be32(from->di_gid);
 879         to->di_nlink = cpu_to_be32(from->di_nlink);
 880         to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 881         to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 882         memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 883         to->di_flushiter = cpu_to_be16(from->di_flushiter);
 884         to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 885         to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 886         to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 887         to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 888         to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 889         to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 890         to->di_size = cpu_to_be64(from->di_size);
 891         to->di_nblocks = cpu_to_be64(from->di_nblocks);
 892         to->di_extsize = cpu_to_be32(from->di_extsize);
 893         to->di_nextents = cpu_to_be32(from->di_nextents);
 894         to->di_anextents = cpu_to_be16(from->di_anextents);
 895         to->di_forkoff = from->di_forkoff;
 896         to->di_aformat = from->di_aformat;
 897         to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 898         to->di_dmstate = cpu_to_be16(from->di_dmstate);
 899         to->di_flags = cpu_to_be16(from->di_flags);
 900         to->di_gen = cpu_to_be32(from->di_gen);
 901 }
 902
 903 STATIC uint
 904 _xfs_dic2xflags(
 905         __uint16_t              di_flags)
 906 {
 907         uint                    flags = 0;
 908
 909         if (di_flags & XFS_DIFLAG_ANY) {
 910                 if (di_flags & XFS_DIFLAG_REALTIME)
 911                         flags |= XFS_XFLAG_REALTIME;
 912                 if (di_flags & XFS_DIFLAG_PREALLOC)
 913                         flags |= XFS_XFLAG_PREALLOC;
 914                 if (di_flags & XFS_DIFLAG_IMMUTABLE)
 915                         flags |= XFS_XFLAG_IMMUTABLE;
 916                 if (di_flags & XFS_DIFLAG_APPEND)
 917                         flags |= XFS_XFLAG_APPEND;
 918                 if (di_flags & XFS_DIFLAG_SYNC)
 919                         flags |= XFS_XFLAG_SYNC;
 920                 if (di_flags & XFS_DIFLAG_NOATIME)
 921                         flags |= XFS_XFLAG_NOATIME;
 922                 if (di_flags & XFS_DIFLAG_NODUMP)
 923                         flags |= XFS_XFLAG_NODUMP;
 924                 if (di_flags & XFS_DIFLAG_RTINHERIT)
 925                         flags |= XFS_XFLAG_RTINHERIT;
 926                 if (di_flags & XFS_DIFLAG_PROJINHERIT)
 927                         flags |= XFS_XFLAG_PROJINHERIT;
 928                 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
 929                         flags |= XFS_XFLAG_NOSYMLINKS;
 930                 if (di_flags & XFS_DIFLAG_EXTSIZE)
 931                         flags |= XFS_XFLAG_EXTSIZE;
 932                 if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 933                         flags |= XFS_XFLAG_EXTSZINHERIT;
 934                 if (di_flags & XFS_DIFLAG_NODEFRAG)
 935                         flags |= XFS_XFLAG_NODEFRAG;
 936                 if (di_flags & XFS_DIFLAG_FILESTREAM)
 937                         flags |= XFS_XFLAG_FILESTREAM;
 938         }
 939
 940         return flags;
 941 }
 942
 943 uint
 944 xfs_ip2xflags(
 945         xfs_inode_t             *ip)
 946 {
 947         xfs_icdinode_t          *dic = &ip->i_d;
 948
 949         return _xfs_dic2xflags(dic->di_flags) |
 950                                 (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
 951 }
 952
 953 uint
 954 xfs_dic2xflags(
 955         xfs_dinode_t            *dip)
 956 {
 957         return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
 958                                 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 959 }
 960
 961 /*
 962  * Read the disk inode attributes into the in-core inode structure.
 963  */
 964 int
 965 xfs_iread(
 966         xfs_mount_t     *mp,
 967         xfs_trans_t     *tp,
 968         xfs_inode_t     *ip,
 969         uint            iget_flags)
 970 {
 971         xfs_buf_t       *bp;
 972         xfs_dinode_t    *dip;
 973         int             error;
 974
 975         /*
 976          * Fill in the location information in the in-core inode.
 977          */
 978         error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 979         if (error)
 980                 return error;
 981
 982         /*
 983          * Get pointers to the on-disk inode and the buffer containing it.
 984          */
 985         error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
 986         if (error)
 987                 return error;
 988
 989         /*
 990          * If we got something that isn't an inode it means someone
 991          * (nfs or dmi) has a stale handle.
 992          */
 993         if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
 994 #ifdef DEBUG
 995                 xfs_alert(mp,
 996                         "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
 997                         __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
 998 #endif /* DEBUG */
 999                 error = XFS_ERROR(EINVAL);
1000                 goto out_brelse;
1001         }
1002
1003         /*
1004          * If the on-disk inode is already linked to a directory
1005          * entry, copy all of the inode into the in-core inode.
1006          * xfs_iformat() handles copying in the inode format
1007          * specific information.
1008          * Otherwise, just get the truly permanent information.
1009          */
1010         if (dip->di_mode) {
1011                 xfs_dinode_from_disk(&ip->i_d, dip);
1012                 error = xfs_iformat(ip, dip);
1013                 if (error)  {
1014 #ifdef DEBUG
1015                         xfs_alert(mp, "%s: xfs_iformat() returned error %d",
1016                                 __func__, error);
1017 #endif /* DEBUG */
1018                         goto out_brelse;
1019                 }
1020         } else {
1021                 ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
1022                 ip->i_d.di_version = dip->di_version;
1023                 ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
1024                 ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
1025                 /*
1026                  * Make sure to pull in the mode here as well in
1027                  * case the inode is released without being used.
1028                  * This ensures that xfs_inactive() will see that
1029                  * the inode is already free and not try to mess
1030                  * with the uninitialized part of it.
1031                  */
1032                 ip->i_d.di_mode = 0;
1033         }
1034
1035         /*
1036          * The inode format changed when we moved the link count and
1037          * made it 32 bits long.  If this is an old format inode,
1038          * convert it in memory to look like a new one.  If it gets
1039          * flushed to disk we will convert back before flushing or
1040          * logging it.  We zero out the new projid field and the old link
1041          * count field.  We'll handle clearing the pad field (the remains
1042          * of the old uuid field) when we actually convert the inode to
1043          * the new format. We don't change the version number so that we
1044          * can distinguish this from a real new format inode.
1045          */
1046         if (ip->i_d.di_version == 1) {
1047                 ip->i_d.di_nlink = ip->i_d.di_onlink;
1048                 ip->i_d.di_onlink = 0;
1049                 xfs_set_projid(ip, 0);
1050         }
1051
1052         ip->i_delayed_blks = 0;
1053
1054         /*
1055          * Mark the buffer containing the inode as something to keep
1056          * around for a while.  This helps to keep recently accessed
1057          * meta-data in-core longer.
1058          */
1059         xfs_buf_set_ref(bp, XFS_INO_REF);
1060
1061         /*
1062          * Use xfs_trans_brelse() to release the buffer containing the
1063          * on-disk inode, because it was acquired with xfs_trans_read_buf()
1064          * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
1065          * brelse().  If we're within a transaction, then xfs_trans_brelse()
1066          * will only release the buffer if it is not dirty within the
1067          * transaction.  It will be OK to release the buffer in this case,
1068          * because inodes on disk are never destroyed and we will be
1069          * locking the new in-core inode before putting it in the hash
1070          * table where other processes can find it.  Thus we don't have
1071          * to worry about the inode being changed just because we released
1072          * the buffer.
1073          */
1074  out_brelse:
1075         xfs_trans_brelse(tp, bp);
1076         return error;
1077 }
1078
1079 /*
1080  * Read in extents from a btree-format inode.
1081  * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
1082  */
1083 int
1084 xfs_iread_extents(
1085         xfs_trans_t     *tp,
1086         xfs_inode_t     *ip,
1087         int             whichfork)
1088 {
1089         int             error;
1090         xfs_ifork_t     *ifp;
1091         xfs_extnum_t    nextents;
1092
1093         if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
1094                 XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
1095                                  ip->i_mount);
1096                 return XFS_ERROR(EFSCORRUPTED);
1097         }
1098         nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
1099         ifp = XFS_IFORK_PTR(ip, whichfork);
1100
1101         /*
1102          * We know that the size is valid (it's checked in iformat_btree)
1103          */
1104         ifp->if_bytes = ifp->if_real_bytes = 0;
1105         ifp->if_flags |= XFS_IFEXTENTS;
1106         xfs_iext_add(ifp, 0, nextents);
1107         error = xfs_bmap_read_extents(tp, ip, whichfork);
1108         if (error) {
1109                 xfs_iext_destroy(ifp);
1110                 ifp->if_flags &= ~XFS_IFEXTENTS;
1111                 return error;
1112         }
1113         xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
1114         return 0;
1115 }
1116
1117 /*
1118  * Allocate an inode on disk and return a copy of its in-core version.
1119  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
1120  * appropriately within the inode.  The uid and gid for the inode are
1121  * set according to the contents of the given cred structure.
1122  *
1123  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
1124  * has a free inode available, call xfs_iget() to obtain the in-core
1125  * version of the allocated inode.  Finally, fill in the inode and
1126  * log its initial contents.  In this case, ialloc_context would be
1127  * set to NULL.
1128  *
1129  * If xfs_dialloc() does not have an available inode, it will replenish
1130  * its supply by doing an allocation. Since we can only do one
1131  * allocation within a transaction without deadlocks, we must commit
1132  * the current transaction before returning the inode itself.
1133  * In this case, therefore, we will set ialloc_context and return.
1134  * The caller should then commit the current transaction, start a new
1135  * transaction, and call xfs_ialloc() again to actually get the inode.
1136  *
1137  * To ensure that some other process does not grab the inode that
1138  * was allocated during the first call to xfs_ialloc(), this routine
1139  * also returns the [locked] bp pointing to the head of the freelist
1140  * as ialloc_context.  The caller should hold this buffer across
1141  * the commit and pass it back into this routine on the second call.
1142  *
1143  * If we are allocating quota inodes, we do not have a parent inode
1144  * to attach to or associate with (i.e. pip == NULL) because they
1145  * are not linked into the directory structure - they are attached
1146  * directly to the superblock - and so have no parent.
1147  */
1148 int
1149 xfs_ialloc(
1150         xfs_trans_t     *tp,
1151         xfs_inode_t     *pip,
1152         umode_t         mode,
1153         xfs_nlink_t     nlink,
1154         xfs_dev_t       rdev,
1155         prid_t          prid,
1156         int             okalloc,
1157         xfs_buf_t       **ialloc_context,
1158         xfs_inode_t     **ipp)
1159 {
1160         xfs_ino_t       ino;
1161         xfs_inode_t     *ip;
1162         uint            flags;
1163         int             error;
1164         timespec_t      tv;
1165         int             filestreams = 0;
1166
1167         /*
1168          * Call the space management code to pick
1169          * the on-disk inode to be allocated.
1170          */
1171         error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
1172                             ialloc_context, &ino);
1173         if (error)
1174                 return error;
1175         if (*ialloc_context || ino == NULLFSINO) {
1176                 *ipp = NULL;
1177                 return 0;
1178         }
1179         ASSERT(*ialloc_context == NULL);
1180
1181         /*
1182          * Get the in-core inode with the lock held exclusively.
1183          * This is because we're setting fields here we need
1184          * to prevent others from looking at until we're done.
1185          */
1186         error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1187                          XFS_ILOCK_EXCL, &ip);
1188         if (error)
1189                 return error;
1190         ASSERT(ip != NULL);
1191
1192         ip->i_d.di_mode = mode;
1193         ip->i_d.di_onlink = 0;
1194         ip->i_d.di_nlink = nlink;
1195         ASSERT(ip->i_d.di_nlink == nlink);
1196         ip->i_d.di_uid = current_fsuid();
1197         ip->i_d.di_gid = current_fsgid();
1198         xfs_set_projid(ip, prid);
1199         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1200
1201         /*
1202          * If the superblock version is up to where we support new format
1203          * inodes and this is currently an old format inode, then change
1204          * the inode version number now.  This way we only do the conversion
1205          * here rather than here and in the flush/logging code.
1206          */
1207         if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
1208             ip->i_d.di_version == 1) {
1209                 ip->i_d.di_version = 2;
1210                 /*
1211                  * We've already zeroed the old link count, the projid field,
1212                  * and the pad field.
1213                  */
1214         }
1215
1216         /*
1217          * Project ids won't be stored on disk if we are using a version 1 inode.
1218          */
1219         if ((prid != 0) && (ip->i_d.di_version == 1))
1220                 xfs_bump_ino_vers2(tp, ip);
1221
1222         if (pip && XFS_INHERIT_GID(pip)) {
1223                 ip->i_d.di_gid = pip->i_d.di_gid;
1224                 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
1225                         ip->i_d.di_mode |= S_ISGID;
1226                 }
1227         }
1228
1229         /*
1230          * If the group ID of the new file does not match the effective group
1231          * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
1232          * (and only if the irix_sgid_inherit compatibility variable is set).
1233          */
1234         if ((irix_sgid_inherit) &&
1235             (ip->i_d.di_mode & S_ISGID) &&
1236             (!in_group_p((gid_t)ip->i_d.di_gid))) {
1237                 ip->i_d.di_mode &= ~S_ISGID;
1238         }
1239
1240         ip->i_d.di_size = 0;
1241         ip->i_d.di_nextents = 0;
1242         ASSERT(ip->i_d.di_nblocks == 0);
1243
1244         nanotime(&tv);
1245         ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
1246         ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
1247         ip->i_d.di_atime = ip->i_d.di_mtime;
1248         ip->i_d.di_ctime = ip->i_d.di_mtime;
1249
1250         /*
1251          * di_gen will have been taken care of in xfs_iread.
1252          */
1253         ip->i_d.di_extsize = 0;
1254         ip->i_d.di_dmevmask = 0;
1255         ip->i_d.di_dmstate = 0;
1256         ip->i_d.di_flags = 0;
1257         flags = XFS_ILOG_CORE;
1258         switch (mode & S_IFMT) {
1259         case S_IFIFO:
1260         case S_IFCHR:
1261         case S_IFBLK:
1262         case S_IFSOCK:
1263                 ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1264                 ip->i_df.if_u2.if_rdev = rdev;
1265                 ip->i_df.if_flags = 0;
1266                 flags |= XFS_ILOG_DEV;
1267                 break;
1268         case S_IFREG:
1269                 /*
1270                  * we can't set up filestreams until after the VFS inode
1271                  * is set up properly.
1272                  */
1273                 if (pip && xfs_inode_is_filestream(pip))
1274                         filestreams = 1;
1275                 /* fall through */
1276         case S_IFDIR:
1277                 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1278                         uint    di_flags = 0;
1279
1280                         if (S_ISDIR(mode)) {
1281                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1282                                         di_flags |= XFS_DIFLAG_RTINHERIT;
1283                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1284                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
1285                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
1286                                 }
1287                         } else if (S_ISREG(mode)) {
1288                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1289                                         di_flags |= XFS_DIFLAG_REALTIME;
1290                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
1291                                         di_flags |= XFS_DIFLAG_EXTSIZE;
1292                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
1293                                 }
1294                         }
1295                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1296                             xfs_inherit_noatime)
1297                                 di_flags |= XFS_DIFLAG_NOATIME;
1298                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1299                             xfs_inherit_nodump)
1300                                 di_flags |= XFS_DIFLAG_NODUMP;
1301                         if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1302                             xfs_inherit_sync)
1303                                 di_flags |= XFS_DIFLAG_SYNC;
1304                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
1305                             xfs_inherit_nosymlinks)
1306                                 di_flags |= XFS_DIFLAG_NOSYMLINKS;
1307                         if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1308                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
1309                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
1310                             xfs_inherit_nodefrag)
1311                                 di_flags |= XFS_DIFLAG_NODEFRAG;
1312                         if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
1313                                 di_flags |= XFS_DIFLAG_FILESTREAM;
1314                         ip->i_d.di_flags |= di_flags;
1315                 }
1316                 /* FALLTHROUGH */
1317         case S_IFLNK:
1318                 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1319                 ip->i_df.if_flags = XFS_IFEXTENTS;
1320                 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1321                 ip->i_df.if_u1.if_extents = NULL;
1322                 break;
1323         default:
1324                 ASSERT(0);
1325         }
1326         /*
1327          * Attribute fork settings for new inode.
1328          */
1329         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1330         ip->i_d.di_anextents = 0;
1331
1332         /*
1333          * Log the new values stuffed into the inode.
1334          */
1335         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1336         xfs_trans_log_inode(tp, ip, flags);
1337
1338         /* now that we have an i_mode we can setup inode ops and unlock */
1339         xfs_setup_inode(ip);
1340
1341         /* now we have set up the vfs inode we can associate the filestream */
1342         if (filestreams) {
1343                 error = xfs_filestream_associate(pip, ip);
1344                 if (error < 0)
1345                         return -error;
1346                 if (!error)
1347                         xfs_iflags_set(ip, XFS_IFILESTREAM);
1348         }
1349
1350         *ipp = ip;
1351         return 0;
1352 }
1353
1354 /*
1355  * Free up the underlying blocks past new_size.  The new size must be smaller
1356  * than the current size.  This routine can be used both for the attribute and
1357  * data fork, and does not modify the inode size, which is left to the caller.
1358  *
1359  * The transaction passed to this routine must have made a permanent log
1360  * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1361  * given transaction and start new ones, so make sure everything involved in
1362  * the transaction is tidy before calling here.  Some transaction will be
1363  * returned to the caller to be committed.  The incoming transaction must
1364  * already include the inode, and both inode locks must be held exclusively.
1365  * The inode must also be "held" within the transaction.  On return the inode
1366  * will be "held" within the returned transaction.  This routine does NOT
1367  * require any disk space to be reserved for it within the transaction.
1368  *
1369  * If we get an error, we must return with the inode locked and linked into the
1370  * current transaction. This keeps things simple for the higher level code,
1371  * because it always knows that the inode is locked and held in the transaction
1372  * that returns to it whether errors occur or not.  We don't mark the inode
1373  * dirty on error so that transactions can be easily aborted if possible.
1374  */
1375 int
1376 xfs_itruncate_extents(
1377         struct xfs_trans        **tpp,
1378         struct xfs_inode        *ip,
1379         int                     whichfork,
1380         xfs_fsize_t             new_size)
1381 {
1382         struct xfs_mount        *mp = ip->i_mount;
1383         struct xfs_trans        *tp = *tpp;
1384         struct xfs_trans        *ntp;
1385         xfs_bmap_free_t         free_list;
1386         xfs_fsblock_t           first_block;
1387         xfs_fileoff_t           first_unmap_block;
1388         xfs_fileoff_t           last_block;
1389         xfs_filblks_t           unmap_len;
1390         int                     committed;
1391         int                     error = 0;
1392         int                     done = 0;
1393
1394         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1395         ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1396                xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1397         ASSERT(new_size <= XFS_ISIZE(ip));
1398         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1399         ASSERT(ip->i_itemp != NULL);
1400         ASSERT(ip->i_itemp->ili_lock_flags == 0);
1401         ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1402
1403         trace_xfs_itruncate_extents_start(ip, new_size);
1404
1405         /*
1406          * Since it is possible for space to become allocated beyond
1407          * the end of the file (in a crash where the space is allocated
1408          * but the inode size is not yet updated), simply remove any
1409          * blocks which show up between the new EOF and the maximum
1410          * possible file size.  If the first block to be removed is
1411          * beyond the maximum file size (ie it is the same as last_block),
1412          * then there is nothing to do.
1413          */
1414         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1415         last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1416         if (first_unmap_block == last_block)
1417                 return 0;
1418
1419         ASSERT(first_unmap_block < last_block);
1420         unmap_len = last_block - first_unmap_block + 1;
1421         while (!done) {
1422                 xfs_bmap_init(&free_list, &first_block);
1423                 error = xfs_bunmapi(tp, ip,
1424                                     first_unmap_block, unmap_len,
1425                                     xfs_bmapi_aflag(whichfork),
1426                                     XFS_ITRUNC_MAX_EXTENTS,
1427                                     &first_block, &free_list,
1428                                     &done);
1429                 if (error)
1430                         goto out_bmap_cancel;
1431
1432                 /*
1433                  * Duplicate the transaction that has the permanent
1434                  * reservation and commit the old transaction.
1435                  */
1436                 error = xfs_bmap_finish(&tp, &free_list, &committed);
1437                 if (committed)
1438                         xfs_trans_ijoin(tp, ip, 0);
1439                 if (error)
1440                         goto out_bmap_cancel;
1441
1442                 if (committed) {
1443                         /*
1444                          * Mark the inode dirty so it will be logged and
1445                          * moved forward in the log as part of every commit.
1446                          */
1447                         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1448                 }
1449
1450                 ntp = xfs_trans_dup(tp);
1451                 error = xfs_trans_commit(tp, 0);
1452                 tp = ntp;
1453
1454                 xfs_trans_ijoin(tp, ip, 0);
1455
1456                 if (error)
1457                         goto out;
1458
1459                 /*
1460                  * Transaction commit worked ok so we can drop the extra ticket
1461                  * reference that we gained in xfs_trans_dup()
1462                  */
1463                 xfs_log_ticket_put(tp->t_ticket);
1464                 error = xfs_trans_reserve(tp, 0,
1465                                         XFS_ITRUNCATE_LOG_RES(mp), 0,
1466                                         XFS_TRANS_PERM_LOG_RES,
1467                                         XFS_ITRUNCATE_LOG_COUNT);
1468                 if (error)
1469                         goto out;
1470         }
1471
1472         /*
1473          * Always re-log the inode so that our permanent transaction can keep
1474          * on rolling it forward in the log.
1475          */
1476         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1477
1478         trace_xfs_itruncate_extents_end(ip, new_size);
1479
1480 out:
1481         *tpp = tp;
1482         return error;
1483 out_bmap_cancel:
1484         /*
1485          * If the bunmapi call encounters an error, return to the caller where
1486          * the transaction can be properly aborted.  We just need to make sure
1487          * we're not holding any resources that we were not when we came in.
1488          */
1489         xfs_bmap_cancel(&free_list);
1490         goto out;
1491 }
1492
1493 /*
1494  * This is called when the inode's link count goes to 0.
1495  * We place the on-disk inode on a list in the AGI.  It
1496  * will be pulled from this list when the inode is freed.
1497  */
1498 int
1499 xfs_iunlink(
1500         xfs_trans_t     *tp,
1501         xfs_inode_t     *ip)
1502 {
1503         xfs_mount_t     *mp;
1504         xfs_agi_t       *agi;
1505         xfs_dinode_t    *dip;
1506         xfs_buf_t       *agibp;
1507         xfs_buf_t       *ibp;
1508         xfs_agino_t     agino;
1509         short           bucket_index;
1510         int             offset;
1511         int             error;
1512
1513         ASSERT(ip->i_d.di_nlink == 0);
1514         ASSERT(ip->i_d.di_mode != 0);
1515
1516         mp = tp->t_mountp;
1517
1518         /*
1519          * Get the agi buffer first.  It ensures lock ordering
1520          * on the list.
1521          */
1522         error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1523         if (error)
1524                 return error;
1525         agi = XFS_BUF_TO_AGI(agibp);
1526
1527         /*
1528          * Get the index into the agi hash table for the
1529          * list this inode will go on.
1530          */
1531         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1532         ASSERT(agino != 0);
1533         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1534         ASSERT(agi->agi_unlinked[bucket_index]);
1535         ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1536
1537         if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1538                 /*
1539                  * There is already another inode in the bucket we need
1540                  * to add ourselves to.  Add us at the front of the list.
1541                  * Here we put the head pointer into our next pointer,
1542                  * and then we fall through to point the head at us.
1543                  */
1544                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1545                                        0, 0);
1546                 if (error)
1547                         return error;
1548
1549                 ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1550                 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1551                 offset = ip->i_imap.im_boffset +
1552                         offsetof(xfs_dinode_t, di_next_unlinked);
1553                 xfs_trans_inode_buf(tp, ibp);
1554                 xfs_trans_log_buf(tp, ibp, offset,
1555                                   (offset + sizeof(xfs_agino_t) - 1));
1556                 xfs_inobp_check(mp, ibp);
1557         }
1558
1559         /*
1560          * Point the bucket head pointer at the inode being inserted.
1561          */
1562         ASSERT(agino != 0);
1563         agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1564         offset = offsetof(xfs_agi_t, agi_unlinked) +
1565                 (sizeof(xfs_agino_t) * bucket_index);
1566         xfs_trans_log_buf(tp, agibp, offset,
1567                           (offset + sizeof(xfs_agino_t) - 1));
1568         return 0;
1569 }
1570
1571 /*
1572  * Pull the on-disk inode from the AGI unlinked list.
1573  */
1574 STATIC int
1575 xfs_iunlink_remove(
1576         xfs_trans_t     *tp,
1577         xfs_inode_t     *ip)
1578 {
1579         xfs_ino_t       next_ino;
1580         xfs_mount_t     *mp;
1581         xfs_agi_t       *agi;
1582         xfs_dinode_t    *dip;
1583         xfs_buf_t       *agibp;
1584         xfs_buf_t       *ibp;
1585         xfs_agnumber_t  agno;
1586         xfs_agino_t     agino;
1587         xfs_agino_t     next_agino;
1588         xfs_buf_t       *last_ibp;
1589         xfs_dinode_t    *last_dip = NULL;
1590         short           bucket_index;
1591         int             offset, last_offset = 0;
1592         int             error;
1593
1594         mp = tp->t_mountp;
1595         agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1596
1597         /*
1598          * Get the agi buffer first.  It ensures lock ordering
1599          * on the list.
1600          */
1601         error = xfs_read_agi(mp, tp, agno, &agibp);
1602         if (error)
1603                 return error;
1604
1605         agi = XFS_BUF_TO_AGI(agibp);
1606
1607         /*
1608          * Get the index into the agi hash table for the
1609          * list this inode will go on.
1610          */
1611         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1612         ASSERT(agino != 0);
1613         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1614         ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
1615         ASSERT(agi->agi_unlinked[bucket_index]);
1616
1617         if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
1618                 /*
1619                  * We're at the head of the list.  Get the inode's on-disk
1620                  * buffer to see if there is anyone after us on the list.
1621                  * Only modify our next pointer if it is not already NULLAGINO.
1622                  * This saves us the overhead of dealing with the buffer when
1623                  * there is no need to change it.
1624                  */
1625                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1626                                        0, 0);
1627                 if (error) {
1628                         xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
1629                                 __func__, error);
1630                         return error;
1631                 }
1632                 next_agino = be32_to_cpu(dip->di_next_unlinked);
1633                 ASSERT(next_agino != 0);
1634                 if (next_agino != NULLAGINO) {
1635                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1636                         offset = ip->i_imap.im_boffset +
1637                                 offsetof(xfs_dinode_t, di_next_unlinked);
1638                         xfs_trans_inode_buf(tp, ibp);
1639                         xfs_trans_log_buf(tp, ibp, offset,
1640                                           (offset + sizeof(xfs_agino_t) - 1));
1641                         xfs_inobp_check(mp, ibp);
1642                 } else {
1643                         xfs_trans_brelse(tp, ibp);
1644                 }
1645                 /*
1646                  * Point the bucket head pointer at the next inode.
1647                  */
1648                 ASSERT(next_agino != 0);
1649                 ASSERT(next_agino != agino);
1650                 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
1651                 offset = offsetof(xfs_agi_t, agi_unlinked) +
1652                         (sizeof(xfs_agino_t) * bucket_index);
1653                 xfs_trans_log_buf(tp, agibp, offset,
1654                                   (offset + sizeof(xfs_agino_t) - 1));
1655         } else {
1656                 /*
1657                  * We need to search the list for the inode being freed.
1658                  */
1659                 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
1660                 last_ibp = NULL;
1661                 while (next_agino != agino) {
1662                         struct xfs_imap imap;
1663
1664                         if (last_ibp)
1665                                 xfs_trans_brelse(tp, last_ibp);
1666
1667                         imap.im_blkno = 0;
1668                         next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
1669
1670                         error = xfs_imap(mp, tp, next_ino, &imap, 0);
1671                         if (error) {
1672                                 xfs_warn(mp,
1673         "%s: xfs_imap returned error %d.",
1674                                          __func__, error);
1675                                 return error;
1676                         }
1677
1678                         error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
1679                                                &last_ibp, 0, 0);
1680                         if (error) {
1681                                 xfs_warn(mp,
1682         "%s: xfs_imap_to_bp returned error %d.",
1683                                         __func__, error);
1684                                 return error;
1685                         }
1686
1687                         last_offset = imap.im_boffset;
1688                         next_agino = be32_to_cpu(last_dip->di_next_unlinked);
1689                         ASSERT(next_agino != NULLAGINO);
1690                         ASSERT(next_agino != 0);
1691                 }
1692
1693                 /*
1694                  * Now last_ibp points to the buffer previous to us on the
1695                  * unlinked list.  Pull us from the list.
1696                  */
1697                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1698                                        0, 0);
1699                 if (error) {
1700                         xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
1701                                 __func__, error);
1702                         return error;
1703                 }
1704                 next_agino = be32_to_cpu(dip->di_next_unlinked);
1705                 ASSERT(next_agino != 0);
1706                 ASSERT(next_agino != agino);
1707                 if (next_agino != NULLAGINO) {
1708                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
1709                         offset = ip->i_imap.im_boffset +
1710                                 offsetof(xfs_dinode_t, di_next_unlinked);
1711                         xfs_trans_inode_buf(tp, ibp);
1712                         xfs_trans_log_buf(tp, ibp, offset,
1713                                           (offset + sizeof(xfs_agino_t) - 1));
1714                         xfs_inobp_check(mp, ibp);
1715                 } else {
1716                         xfs_trans_brelse(tp, ibp);
1717                 }
1718                 /*
1719                  * Point the previous inode on the list to the next inode.
1720                  */
1721                 last_dip->di_next_unlinked = cpu_to_be32(next_agino);
1722                 ASSERT(next_agino != 0);
1723                 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
1724                 xfs_trans_inode_buf(tp, last_ibp);
1725                 xfs_trans_log_buf(tp, last_ibp, offset,
1726                                   (offset + sizeof(xfs_agino_t) - 1));
1727                 xfs_inobp_check(mp, last_ibp);
1728         }
1729         return 0;
1730 }
1731
1732 /*
1733  * A big issue when freeing the inode cluster is is that we _cannot_ skip any
1734  * inodes that are in memory - they all must be marked stale and attached to
1735  * the cluster buffer.
1736  */
1737 STATIC int
1738 xfs_ifree_cluster(
1739         xfs_inode_t     *free_ip,
1740         xfs_trans_t     *tp,
1741         xfs_ino_t       inum)
1742 {
1743         xfs_mount_t             *mp = free_ip->i_mount;
1744         int                     blks_per_cluster;
1745         int                     nbufs;
1746         int                     ninodes;
1747         int                     i, j;
1748         xfs_daddr_t             blkno;
1749         xfs_buf_t               *bp;
1750         xfs_inode_t             *ip;
1751         xfs_inode_log_item_t    *iip;
1752         xfs_log_item_t          *lip;
1753         struct xfs_perag        *pag;
1754
1755         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1756         if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1757                 blks_per_cluster = 1;
1758                 ninodes = mp->m_sb.sb_inopblock;
1759                 nbufs = XFS_IALLOC_BLOCKS(mp);
1760         } else {
1761                 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
1762                                         mp->m_sb.sb_blocksize;
1763                 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
1764                 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1765         }
1766
1767         for (j = 0; j < nbufs; j++, inum += ninodes) {
1768                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1769                                          XFS_INO_TO_AGBNO(mp, inum));
1770
1771                 /*
1772                  * We obtain and lock the backing buffer first in the process
1773                  * here, as we have to ensure that any dirty inode that we
1774                  * can't get the flush lock on is attached to the buffer.
1775                  * If we scan the in-memory inodes first, then buffer IO can
1776                  * complete before we get a lock on it, and hence we may fail
1777                  * to mark all the active inodes on the buffer stale.
1778                  */
1779                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1780                                         mp->m_bsize * blks_per_cluster,
1781                                         XBF_UNMAPPED);
1782
1783                 if (!bp)
1784                         return ENOMEM;
1785
1786                 /*
1787                  * This buffer may not have been correctly initialised as we
1788                  * didn't read it from disk. That's not important because we are
1789                  * only using to mark the buffer as stale in the log, and to
1790                  * attach stale cached inodes on it. That means it will never be
1791                  * dispatched for IO. If it is, we want to know about it, and we
1792                  * want it to fail. We can acheive this by adding a write
1793                  * verifier to the buffer.
1794                  */
1795                  bp->b_pre_io = xfs_inode_buf_write_verify;
1796
1797                 /*
1798                  * Walk the inodes already attached to the buffer and mark them
1799                  * stale. These will all have the flush locks held, so an
1800                  * in-memory inode walk can't lock them. By marking them all
1801                  * stale first, we will not attempt to lock them in the loop
1802                  * below as the XFS_ISTALE flag will be set.
1803                  */
1804                 lip = bp->b_fspriv;
1805                 while (lip) {
1806                         if (lip->li_type == XFS_LI_INODE) {
1807                                 iip = (xfs_inode_log_item_t *)lip;
1808                                 ASSERT(iip->ili_logged == 1);
1809                                 lip->li_cb = xfs_istale_done;
1810                                 xfs_trans_ail_copy_lsn(mp->m_ail,
1811                                                         &iip->ili_flush_lsn,
1812                                                         &iip->ili_item.li_lsn);
1813                                 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1814                         }
1815                         lip = lip->li_bio_list;
1816                 }
1817
1818
1819                 /*
1820                  * For each inode in memory attempt to add it to the inode
1821                  * buffer and set it up for being staled on buffer IO
1822                  * completion.  This is safe as we've locked out tail pushing
1823                  * and flushing by locking the buffer.
1824                  *
1825                  * We have already marked every inode that was part of a
1826                  * transaction stale above, which means there is no point in
1827                  * even trying to lock them.
1828                  */
1829                 for (i = 0; i < ninodes; i++) {
1830 retry:
1831                         rcu_read_lock();
1832                         ip = radix_tree_lookup(&pag->pag_ici_root,
1833                                         XFS_INO_TO_AGINO(mp, (inum + i)));
1834
1835                         /* Inode not in memory, nothing to do */
1836                         if (!ip) {
1837                                 rcu_read_unlock();
1838                                 continue;
1839                         }
1840
1841                         /*
1842                          * because this is an RCU protected lookup, we could
1843                          * find a recently freed or even reallocated inode
1844                          * during the lookup. We need to check under the
1845                          * i_flags_lock for a valid inode here. Skip it if it
1846                          * is not valid, the wrong inode or stale.
1847                          */
1848                         spin_lock(&ip->i_flags_lock);
1849                         if (ip->i_ino != inum + i ||
1850                             __xfs_iflags_test(ip, XFS_ISTALE)) {
1851                                 spin_unlock(&ip->i_flags_lock);
1852                                 rcu_read_unlock();
1853                                 continue;
1854                         }
1855                         spin_unlock(&ip->i_flags_lock);
1856
1857                         /*
1858                          * Don't try to lock/unlock the current inode, but we
1859                          * _cannot_ skip the other inodes that we did not find
1860                          * in the list attached to the buffer and are not
1861                          * already marked stale. If we can't lock it, back off
1862                          * and retry.
1863                          */
1864                         if (ip != free_ip &&
1865                             !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1866                                 rcu_read_unlock();
1867                                 delay(1);
1868                                 goto retry;
1869                         }
1870                         rcu_read_unlock();
1871
1872                         xfs_iflock(ip);
1873                         xfs_iflags_set(ip, XFS_ISTALE);
1874
1875                         /*
1876                          * we don't need to attach clean inodes or those only
1877                          * with unlogged changes (which we throw away, anyway).
1878                          */
1879                         iip = ip->i_itemp;
1880                         if (!iip || xfs_inode_clean(ip)) {
1881                                 ASSERT(ip != free_ip);
1882                                 xfs_ifunlock(ip);
1883                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1884                                 continue;
1885                         }
1886
1887                         iip->ili_last_fields = iip->ili_fields;
1888                         iip->ili_fields = 0;
1889                         iip->ili_logged = 1;
1890                         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
1891                                                 &iip->ili_item.li_lsn);
1892
1893                         xfs_buf_attach_iodone(bp, xfs_istale_done,
1894                                                   &iip->ili_item);
1895
1896                         if (ip != free_ip)
1897                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1898                 }
1899
1900                 xfs_trans_stale_inode_buf(tp, bp);
1901                 xfs_trans_binval(tp, bp);
1902         }
1903
1904         xfs_perag_put(pag);
1905         return 0;
1906 }
1907
1908 /*
1909  * This is called to return an inode to the inode free list.
1910  * The inode should already be truncated to 0 length and have
1911  * no pages associated with it.  This routine also assumes that
1912  * the inode is already a part of the transaction.
1913  *
1914  * The on-disk copy of the inode will have been added to the list
1915  * of unlinked inodes in the AGI. We need to remove the inode from
1916  * that list atomically with respect to freeing it here.
1917  */
1918 int
1919 xfs_ifree(
1920         xfs_trans_t     *tp,
1921         xfs_inode_t     *ip,
1922         xfs_bmap_free_t *flist)
1923 {
1924         int                     error;
1925         int                     delete;
1926         xfs_ino_t               first_ino;
1927         xfs_dinode_t            *dip;
1928         xfs_buf_t               *ibp;
1929
1930         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1931         ASSERT(ip->i_d.di_nlink == 0);
1932         ASSERT(ip->i_d.di_nextents == 0);
1933         ASSERT(ip->i_d.di_anextents == 0);
1934         ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1935         ASSERT(ip->i_d.di_nblocks == 0);
1936
1937         /*
1938          * Pull the on-disk inode from the AGI unlinked list.
1939          */
1940         error = xfs_iunlink_remove(tp, ip);
1941         if (error != 0) {
1942                 return error;
1943         }
1944
1945         error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
1946         if (error != 0) {
1947                 return error;
1948         }
1949         ip->i_d.di_mode = 0;            /* mark incore inode as free */
1950         ip->i_d.di_flags = 0;
1951         ip->i_d.di_dmevmask = 0;
1952         ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
1953         ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1954         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1955         /*
1956          * Bump the generation count so no one will be confused
1957          * by reincarnations of this inode.
1958          */
1959         ip->i_d.di_gen++;
1960
1961         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1962
1963         error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
1964                                0, 0);
1965         if (error)
1966                 return error;
1967
1968         /*
1969         * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
1970         * from picking up this inode when it is reclaimed (its incore state
1971         * initialzed but not flushed to disk yet). The in-core di_mode is
1972         * already cleared  and a corresponding transaction logged.
1973         * The hack here just synchronizes the in-core to on-disk
1974         * di_mode value in advance before the actual inode sync to disk.
1975         * This is OK because the inode is already unlinked and would never
1976         * change its di_mode again for this inode generation.
1977         * This is a temporary hack that would require a proper fix
1978         * in the future.
1979         */
1980         dip->di_mode = 0;
1981
1982         if (delete) {
1983                 error = xfs_ifree_cluster(ip, tp, first_ino);
1984         }
1985
1986         return error;
1987 }
1988
1989 /*
1990  * Reallocate the space for if_broot based on the number of records
1991  * being added or deleted as indicated in rec_diff.  Move the records
1992  * and pointers in if_broot to fit the new size.  When shrinking this
1993  * will eliminate holes between the records and pointers created by
1994  * the caller.  When growing this will create holes to be filled in
1995  * by the caller.
1996  *
1997  * The caller must not request to add more records than would fit in
1998  * the on-disk inode root.  If the if_broot is currently NULL, then
1999  * if we adding records one will be allocated.  The caller must also
2000  * not request that the number of records go below zero, although
2001  * it can go to zero.
2002  *
2003  * ip -- the inode whose if_broot area is changing
2004  * ext_diff -- the change in the number of records, positive or negative,
2005  *       requested for the if_broot array.
2006  */
2007 void
2008 xfs_iroot_realloc(
2009         xfs_inode_t             *ip,
2010         int                     rec_diff,
2011         int                     whichfork)
2012 {
2013         struct xfs_mount        *mp = ip->i_mount;
2014         int                     cur_max;
2015         xfs_ifork_t             *ifp;
2016         struct xfs_btree_block  *new_broot;
2017         int                     new_max;
2018         size_t                  new_size;
2019         char                    *np;
2020         char                    *op;
2021
2022         /*
2023          * Handle the degenerate case quietly.
2024          */
2025         if (rec_diff == 0) {
2026                 return;
2027         }
2028
2029         ifp = XFS_IFORK_PTR(ip, whichfork);
2030         if (rec_diff > 0) {
2031                 /*
2032                  * If there wasn't any memory allocated before, just
2033                  * allocate it now and get out.
2034                  */
2035                 if (ifp->if_broot_bytes == 0) {
2036                         new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2037                         ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2038                         ifp->if_broot_bytes = (int)new_size;
2039                         return;
2040                 }
2041
2042                 /*
2043                  * If there is already an existing if_broot, then we need
2044                  * to realloc() it and shift the pointers to their new
2045                  * location.  The records don't change location because
2046                  * they are kept butted up against the btree block header.
2047                  */
2048                 cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2049                 new_max = cur_max + rec_diff;
2050                 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2051                 ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2052                                 (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2053                                 KM_SLEEP | KM_NOFS);
2054                 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2055                                                      ifp->if_broot_bytes);
2056                 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2057                                                      (int)new_size);
2058                 ifp->if_broot_bytes = (int)new_size;
2059                 ASSERT(ifp->if_broot_bytes <=
2060                         XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2061                 memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2062                 return;
2063         }
2064
2065         /*
2066          * rec_diff is less than 0.  In this case, we are shrinking the
2067          * if_broot buffer.  It must already exist.  If we go to zero
2068          * records, just get rid of the root and clear the status bit.
2069          */
2070         ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2071         cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2072         new_max = cur_max + rec_diff;
2073         ASSERT(new_max >= 0);
2074         if (new_max > 0)
2075                 new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2076         else
2077                 new_size = 0;
2078         if (new_size > 0) {
2079                 new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2080                 /*
2081                  * First copy over the btree block header.
2082                  */
2083                 memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
2084         } else {
2085                 new_broot = NULL;
2086                 ifp->if_flags &= ~XFS_IFBROOT;
2087         }
2088
2089         /*
2090          * Only copy the records and pointers if there are any.
2091          */
2092         if (new_max > 0) {
2093                 /*
2094                  * First copy the records.
2095                  */
2096                 op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2097                 np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2098                 memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2099
2100                 /*
2101                  * Then copy the pointers.
2102                  */
2103                 op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2104                                                      ifp->if_broot_bytes);
2105                 np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2106                                                      (int)new_size);
2107                 memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2108         }
2109         kmem_free(ifp->if_broot);
2110         ifp->if_broot = new_broot;
2111         ifp->if_broot_bytes = (int)new_size;
2112         ASSERT(ifp->if_broot_bytes <=
2113                 XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2114         return;
2115 }
2116
2117
2118 /*
2119  * This is called when the amount of space needed for if_data
2120  * is increased or decreased.  The change in size is indicated by
2121  * the number of bytes that need to be added or deleted in the
2122  * byte_diff parameter.
2123  *
2124  * If the amount of space needed has decreased below the size of the
2125  * inline buffer, then switch to using the inline buffer.  Otherwise,
2126  * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2127  * to what is needed.
2128  *
2129  * ip -- the inode whose if_data area is changing
2130  * byte_diff -- the change in the number of bytes, positive or negative,
2131  *       requested for the if_data array.
2132  */
2133 void
2134 xfs_idata_realloc(
2135         xfs_inode_t     *ip,
2136         int             byte_diff,
2137         int             whichfork)
2138 {
2139         xfs_ifork_t     *ifp;
2140         int             new_size;
2141         int             real_size;
2142
2143         if (byte_diff == 0) {
2144                 return;
2145         }
2146
2147         ifp = XFS_IFORK_PTR(ip, whichfork);
2148         new_size = (int)ifp->if_bytes + byte_diff;
2149         ASSERT(new_size >= 0);
2150
2151         if (new_size == 0) {
2152                 if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2153                         kmem_free(ifp->if_u1.if_data);
2154                 }
2155                 ifp->if_u1.if_data = NULL;
2156                 real_size = 0;
2157         } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2158                 /*
2159                  * If the valid extents/data can fit in if_inline_ext/data,
2160                  * copy them from the malloc'd vector and free it.
2161                  */
2162                 if (ifp->if_u1.if_data == NULL) {
2163                         ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2164                 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2165                         ASSERT(ifp->if_real_bytes != 0);
2166                         memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2167                               new_size);
2168                         kmem_free(ifp->if_u1.if_data);
2169                         ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2170                 }
2171                 real_size = 0;
2172         } else {
2173                 /*
2174                  * Stuck with malloc/realloc.
2175                  * For inline data, the underlying buffer must be
2176                  * a multiple of 4 bytes in size so that it can be
2177                  * logged and stay on word boundaries.  We enforce
2178                  * that here.
2179                  */
2180                 real_size = roundup(new_size, 4);
2181                 if (ifp->if_u1.if_data == NULL) {
2182                         ASSERT(ifp->if_real_bytes == 0);
2183                         ifp->if_u1.if_data = kmem_alloc(real_size,
2184                                                         KM_SLEEP | KM_NOFS);
2185                 } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2186                         /*
2187                          * Only do the realloc if the underlying size
2188                          * is really changing.
2189                          */
2190                         if (ifp->if_real_bytes != real_size) {
2191                                 ifp->if_u1.if_data =
2192                                         kmem_realloc(ifp->if_u1.if_data,
2193                                                         real_size,
2194                                                         ifp->if_real_bytes,
2195                                                         KM_SLEEP | KM_NOFS);
2196                         }
2197                 } else {
2198                         ASSERT(ifp->if_real_bytes == 0);
2199                         ifp->if_u1.if_data = kmem_alloc(real_size,
2200                                                         KM_SLEEP | KM_NOFS);
2201                         memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2202                                 ifp->if_bytes);
2203                 }
2204         }
2205         ifp->if_real_bytes = real_size;
2206         ifp->if_bytes = new_size;
2207         ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2208 }
2209
2210 void
2211 xfs_idestroy_fork(
2212         xfs_inode_t     *ip,
2213         int             whichfork)
2214 {
2215         xfs_ifork_t     *ifp;
2216
2217         ifp = XFS_IFORK_PTR(ip, whichfork);
2218         if (ifp->if_broot != NULL) {
2219                 kmem_free(ifp->if_broot);
2220                 ifp->if_broot = NULL;
2221         }
2222
2223         /*
2224          * If the format is local, then we can't have an extents
2225          * array so just look for an inline data array.  If we're
2226          * not local then we may or may not have an extents list,
2227          * so check and free it up if we do.
2228          */
2229         if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2230                 if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2231                     (ifp->if_u1.if_data != NULL)) {
2232                         ASSERT(ifp->if_real_bytes != 0);
2233                         kmem_free(ifp->if_u1.if_data);
2234                         ifp->if_u1.if_data = NULL;
2235                         ifp->if_real_bytes = 0;
2236                 }
2237         } else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2238                    ((ifp->if_flags & XFS_IFEXTIREC) ||
2239                     ((ifp->if_u1.if_extents != NULL) &&
2240                      (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2241                 ASSERT(ifp->if_real_bytes != 0);
2242                 xfs_iext_destroy(ifp);
2243         }
2244         ASSERT(ifp->if_u1.if_extents == NULL ||
2245                ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2246         ASSERT(ifp->if_real_bytes == 0);
2247         if (whichfork == XFS_ATTR_FORK) {
2248                 kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2249                 ip->i_afp = NULL;
2250         }
2251 }
2252
2253 /*
2254  * This is called to unpin an inode.  The caller must have the inode locked
2255  * in at least shared mode so that the buffer cannot be subsequently pinned
2256  * once someone is waiting for it to be unpinned.
2257  */
2258 static void
2259 xfs_iunpin(
2260         struct xfs_inode        *ip)
2261 {
2262         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2263
2264         trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2265
2266         /* Give the log a push to start the unpinning I/O */
2267         xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2268
2269 }
2270
2271 static void
2272 __xfs_iunpin_wait(
2273         struct xfs_inode        *ip)
2274 {
2275         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2276         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2277
2278         xfs_iunpin(ip);
2279
2280         do {
2281                 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2282                 if (xfs_ipincount(ip))
2283                         io_schedule();
2284         } while (xfs_ipincount(ip));
2285         finish_wait(wq, &wait.wait);
2286 }
2287
/*
 * Wait for the inode to become unpinned.  Returns immediately when the pin
 * count is already zero; otherwise hands off to __xfs_iunpin_wait().
 */
void
xfs_iunpin_wait(
        struct xfs_inode        *ip)
{
        if (!xfs_ipincount(ip))
                return;

        __xfs_iunpin_wait(ip);
}
2295
2296 /*
2297  * xfs_iextents_copy()
2298  *
2299  * This is called to copy the REAL extents (as opposed to the delayed
2300  * allocation extents) from the inode into the given buffer.  It
2301  * returns the number of bytes copied into the buffer.
2302  *
2303  * If there are no delayed allocation extents, then we can just
2304  * memcpy() the extents into the buffer.  Otherwise, we need to
2305  * examine each extent in turn and skip those which are delayed.
2306  */
2307 int
2308 xfs_iextents_copy(
2309         xfs_inode_t             *ip,
2310         xfs_bmbt_rec_t          *dp,
2311         int                     whichfork)
2312 {
2313         int                     copied;
2314         int                     i;
2315         xfs_ifork_t             *ifp;
2316         int                     nrecs;
2317         xfs_fsblock_t           start_block;
2318
2319         ifp = XFS_IFORK_PTR(ip, whichfork);
2320         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2321         ASSERT(ifp->if_bytes > 0);
2322
2323         nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2324         XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2325         ASSERT(nrecs > 0);
2326
2327         /*
2328          * There are some delayed allocation extents in the
2329          * inode, so copy the extents one at a time and skip
2330          * the delayed ones.  There must be at least one
2331          * non-delayed extent.
2332          */
2333         copied = 0;
2334         for (i = 0; i < nrecs; i++) {
2335                 xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2336                 start_block = xfs_bmbt_get_startblock(ep);
2337                 if (isnullstartblock(start_block)) {
2338                         /*
2339                          * It's a delayed allocation extent, so skip it.
2340                          */
2341                         continue;
2342                 }
2343
2344                 /* Translate to on disk format */
2345                 put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2346                 put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2347                 dp++;
2348                 copied++;
2349         }
2350         ASSERT(copied != 0);
2351         xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2352
2353         return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2354 }
2355
2356 /*
2357  * Each of the following cases stores data into the same region
2358  * of the on-disk inode, so only one of them can be valid at
2359  * any given time. While it is possible to have conflicting formats
2360  * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2361  * in EXTENTS format, this can only happen when the fork has
2362  * changed formats after being modified but before being flushed.
2363  * In these cases, the format always takes precedence, because the
2364  * format indicates the current state of the fork.
2365  */
2366 /*ARGSUSED*/
2367 STATIC void
2368 xfs_iflush_fork(
2369         xfs_inode_t             *ip,
2370         xfs_dinode_t            *dip,
2371         xfs_inode_log_item_t    *iip,
2372         int                     whichfork,
2373         xfs_buf_t               *bp)
2374 {
2375         char                    *cp;
2376         xfs_ifork_t             *ifp;
2377         xfs_mount_t             *mp;
2378 #ifdef XFS_TRANS_DEBUG
2379         int                     first;
2380 #endif
2381         static const short      brootflag[2] =
2382                 { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2383         static const short      dataflag[2] =
2384                 { XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2385         static const short      extflag[2] =
2386                 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2387
2388         if (!iip)
2389                 return;
2390         ifp = XFS_IFORK_PTR(ip, whichfork);
2391         /*
2392          * This can happen if we gave up in iformat in an error path,
2393          * for the attribute fork.
2394          */
2395         if (!ifp) {
2396                 ASSERT(whichfork == XFS_ATTR_FORK);
2397                 return;
2398         }
2399         cp = XFS_DFORK_PTR(dip, whichfork);
2400         mp = ip->i_mount;
2401         switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2402         case XFS_DINODE_FMT_LOCAL:
2403                 if ((iip->ili_fields & dataflag[whichfork]) &&
2404                     (ifp->if_bytes > 0)) {
2405                         ASSERT(ifp->if_u1.if_data != NULL);
2406                         ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2407                         memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2408                 }
2409                 break;
2410
2411         case XFS_DINODE_FMT_EXTENTS:
2412                 ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2413                        !(iip->ili_fields & extflag[whichfork]));
2414                 if ((iip->ili_fields & extflag[whichfork]) &&
2415                     (ifp->if_bytes > 0)) {
2416                         ASSERT(xfs_iext_get_ext(ifp, 0));
2417                         ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2418                         (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2419                                 whichfork);
2420                 }
2421                 break;
2422
2423         case XFS_DINODE_FMT_BTREE:
2424                 if ((iip->ili_fields & brootflag[whichfork]) &&
2425                     (ifp->if_broot_bytes > 0)) {
2426                         ASSERT(ifp->if_broot != NULL);
2427                         ASSERT(ifp->if_broot_bytes <=
2428                                (XFS_IFORK_SIZE(ip, whichfork) +
2429                                 XFS_BROOT_SIZE_ADJ));
2430                         xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2431                                 (xfs_bmdr_block_t *)cp,
2432                                 XFS_DFORK_SIZE(dip, mp, whichfork));
2433                 }
2434                 break;
2435
2436         case XFS_DINODE_FMT_DEV:
2437                 if (iip->ili_fields & XFS_ILOG_DEV) {
2438                         ASSERT(whichfork == XFS_DATA_FORK);
2439                         xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2440                 }
2441                 break;
2442
2443         case XFS_DINODE_FMT_UUID:
2444                 if (iip->ili_fields & XFS_ILOG_UUID) {
2445                         ASSERT(whichfork == XFS_DATA_FORK);
2446                         memcpy(XFS_DFORK_DPTR(dip),
2447                                &ip->i_df.if_u2.if_uuid,
2448                                sizeof(uuid_t));
2449                 }
2450                 break;
2451
2452         default:
2453                 ASSERT(0);
2454                 break;
2455         }
2456 }
2457
2458 STATIC int
2459 xfs_iflush_cluster(
2460         xfs_inode_t     *ip,
2461         xfs_buf_t       *bp)
2462 {
2463         xfs_mount_t             *mp = ip->i_mount;
2464         struct xfs_perag        *pag;
2465         unsigned long           first_index, mask;
2466         unsigned long           inodes_per_cluster;
2467         int                     ilist_size;
2468         xfs_inode_t             **ilist;
2469         xfs_inode_t             *iq;
2470         int                     nr_found;
2471         int                     clcount = 0;
2472         int                     bufwasdelwri;
2473         int                     i;
2474
2475         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2476
2477         inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2478         ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2479         ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2480         if (!ilist)
2481                 goto out_put;
2482
2483         mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2484         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2485         rcu_read_lock();
2486         /* really need a gang lookup range call here */
2487         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2488                                         first_index, inodes_per_cluster);
2489         if (nr_found == 0)
2490                 goto out_free;
2491
2492         for (i = 0; i < nr_found; i++) {
2493                 iq = ilist[i];
2494                 if (iq == ip)
2495                         continue;
2496
2497                 /*
2498                  * because this is an RCU protected lookup, we could find a
2499                  * recently freed or even reallocated inode during the lookup.
2500                  * We need to check under the i_flags_lock for a valid inode
2501                  * here. Skip it if it is not valid or the wrong inode.
2502                  */
2503                 spin_lock(&ip->i_flags_lock);
2504                 if (!ip->i_ino ||
2505                     (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2506                         spin_unlock(&ip->i_flags_lock);
2507                         continue;
2508                 }
2509                 spin_unlock(&ip->i_flags_lock);
2510
2511                 /*
2512                  * Do an un-protected check to see if the inode is dirty and
2513                  * is a candidate for flushing.  These checks will be repeated
2514                  * later after the appropriate locks are acquired.
2515                  */
2516                 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2517                         continue;
2518
2519                 /*
2520                  * Try to get locks.  If any are unavailable or it is pinned,
2521                  * then this inode cannot be flushed and is skipped.
2522                  */
2523
2524                 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2525                         continue;
2526                 if (!xfs_iflock_nowait(iq)) {
2527                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
2528                         continue;
2529                 }
2530                 if (xfs_ipincount(iq)) {
2531                         xfs_ifunlock(iq);
2532                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
2533                         continue;
2534                 }
2535
2536                 /*
2537                  * arriving here means that this inode can be flushed.  First
2538                  * re-check that it's dirty before flushing.
2539                  */
2540                 if (!xfs_inode_clean(iq)) {
2541                         int     error;
2542                         error = xfs_iflush_int(iq, bp);
2543                         if (error) {
2544                                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2545                                 goto cluster_corrupt_out;
2546                         }
2547                         clcount++;
2548                 } else {
2549                         xfs_ifunlock(iq);
2550                 }
2551                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2552         }
2553
2554         if (clcount) {
2555                 XFS_STATS_INC(xs_icluster_flushcnt);
2556                 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2557         }
2558
2559 out_free:
2560         rcu_read_unlock();
2561         kmem_free(ilist);
2562 out_put:
2563         xfs_perag_put(pag);
2564         return 0;
2565
2566
2567 cluster_corrupt_out:
2568         /*
2569          * Corruption detected in the clustering loop.  Invalidate the
2570          * inode buffer and shut down the filesystem.
2571          */
2572         rcu_read_unlock();
2573         /*
2574          * Clean up the buffer.  If it was delwri, just release it --
2575          * brelse can handle it with no problems.  If not, shut down the
2576          * filesystem before releasing the buffer.
2577          */
2578         bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
2579         if (bufwasdelwri)
2580                 xfs_buf_relse(bp);
2581
2582         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2583
2584         if (!bufwasdelwri) {
2585                 /*
2586                  * Just like incore_relse: if we have b_iodone functions,
2587                  * mark the buffer as an error and call them.  Otherwise
2588                  * mark it as stale and brelse.
2589                  */
2590                 if (bp->b_iodone) {
2591                         XFS_BUF_UNDONE(bp);
2592                         xfs_buf_stale(bp);
2593                         xfs_buf_ioerror(bp, EIO);
2594                         xfs_buf_ioend(bp, 0);
2595                 } else {
2596                         xfs_buf_stale(bp);
2597                         xfs_buf_relse(bp);
2598                 }
2599         }
2600
2601         /*
2602          * Unlocks the flush lock
2603          */
2604         xfs_iflush_abort(iq, false);
2605         kmem_free(ilist);
2606         xfs_perag_put(pag);
2607         return XFS_ERROR(EFSCORRUPTED);
2608 }
2609
2610 /*
2611  * Flush dirty inode metadata into the backing buffer.
2612  *
2613  * The caller must have the inode lock and the inode flush lock held.  The
2614  * inode lock will still be held upon return to the caller, and the inode
2615  * flush lock will be released after the inode has reached the disk.
2616  *
2617  * The caller must write out the buffer returned in *bpp and release it.
2618  */
2619 int
2620 xfs_iflush(
2621         struct xfs_inode        *ip,
2622         struct xfs_buf          **bpp)
2623 {
2624         struct xfs_mount        *mp = ip->i_mount;
2625         struct xfs_buf          *bp;
2626         struct xfs_dinode       *dip;
2627         int                     error;
2628
2629         XFS_STATS_INC(xs_iflush_count);
2630
2631         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2632         ASSERT(xfs_isiflocked(ip));
2633         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2634                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2635
2636         *bpp = NULL;
2637
2638         xfs_iunpin_wait(ip);
2639
2640         /*
2641          * For stale inodes we cannot rely on the backing buffer remaining
2642          * stale in cache for the remaining life of the stale inode and so
2643          * xfs_imap_to_bp() below may give us a buffer that no longer contains
2644          * inodes below. We have to check this after ensuring the inode is
2645          * unpinned so that it is safe to reclaim the stale inode after the
2646          * flush call.
2647          */
2648         if (xfs_iflags_test(ip, XFS_ISTALE)) {
2649                 xfs_ifunlock(ip);
2650                 return 0;
2651         }
2652
2653         /*
2654          * This may have been unpinned because the filesystem is shutting
2655          * down forcibly. If that's the case we must not write this inode
2656          * to disk, because the log record didn't make it to disk.
2657          *
2658          * We also have to remove the log item from the AIL in this case,
2659          * as we wait for an empty AIL as part of the unmount process.
2660          */
2661         if (XFS_FORCED_SHUTDOWN(mp)) {
2662                 error = XFS_ERROR(EIO);
2663                 goto abort_out;
2664         }
2665
2666         /*
2667          * Get the buffer containing the on-disk inode.
2668          */
2669         error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
2670                                0);
2671         if (error || !bp) {
2672                 xfs_ifunlock(ip);
2673                 return error;
2674         }
2675
2676         /*
2677          * First flush out the inode that xfs_iflush was called with.
2678          */
2679         error = xfs_iflush_int(ip, bp);
2680         if (error)
2681                 goto corrupt_out;
2682
2683         /*
2684          * If the buffer is pinned then push on the log now so we won't
2685          * get stuck waiting in the write for too long.
2686          */
2687         if (xfs_buf_ispinned(bp))
2688                 xfs_log_force(mp, 0);
2689
2690         /*
2691          * inode clustering:
2692          * see if other inodes can be gathered into this write
2693          */
2694         error = xfs_iflush_cluster(ip, bp);
2695         if (error)
2696                 goto cluster_corrupt_out;
2697
2698         *bpp = bp;
2699         return 0;
2700
2701 corrupt_out:
2702         xfs_buf_relse(bp);
2703         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2704 cluster_corrupt_out:
2705         error = XFS_ERROR(EFSCORRUPTED);
2706 abort_out:
2707         /*
2708          * Unlocks the flush lock
2709          */
2710         xfs_iflush_abort(ip, false);
2711         return error;
2712 }
2713
2714
2715 STATIC int
2716 xfs_iflush_int(
2717         xfs_inode_t             *ip,
2718         xfs_buf_t               *bp)
2719 {
2720         xfs_inode_log_item_t    *iip;
2721         xfs_dinode_t            *dip;
2722         xfs_mount_t             *mp;
2723 #ifdef XFS_TRANS_DEBUG
2724         int                     first;
2725 #endif
2726
2727         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2728         ASSERT(xfs_isiflocked(ip));
2729         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2730                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2731
2732         iip = ip->i_itemp;
2733         mp = ip->i_mount;
2734
2735         /* set *dip = inode's place in the buffer */
2736         dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
2737
2738         if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
2739                                mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2740                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2741                         "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2742                         __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2743                 goto corrupt_out;
2744         }
2745         if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2746                                 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2747                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2748                         "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2749                         __func__, ip->i_ino, ip, ip->i_d.di_magic);
2750                 goto corrupt_out;
2751         }
2752         if (S_ISREG(ip->i_d.di_mode)) {
2753                 if (XFS_TEST_ERROR(
2754                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2755                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2756                     mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2757                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2758                                 "%s: Bad regular inode %Lu, ptr 0x%p",
2759                                 __func__, ip->i_ino, ip);
2760                         goto corrupt_out;
2761                 }
2762         } else if (S_ISDIR(ip->i_d.di_mode)) {
2763                 if (XFS_TEST_ERROR(
2764                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2765                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2766                     (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2767                     mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2768                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2769                                 "%s: Bad directory inode %Lu, ptr 0x%p",
2770                                 __func__, ip->i_ino, ip);
2771                         goto corrupt_out;
2772                 }
2773         }
2774         if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2775                                 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2776                                 XFS_RANDOM_IFLUSH_5)) {
2777                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2778                         "%s: detected corrupt incore inode %Lu, "
2779                         "total extents = %d, nblocks = %Ld, ptr 0x%p",
2780                         __func__, ip->i_ino,
2781                         ip->i_d.di_nextents + ip->i_d.di_anextents,
2782                         ip->i_d.di_nblocks, ip);
2783                 goto corrupt_out;
2784         }
2785         if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2786                                 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2787                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2788                         "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2789                         __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2790                 goto corrupt_out;
2791         }
2792         /*
2793          * bump the flush iteration count, used to detect flushes which
2794          * postdate a log record during recovery.
2795          */
2796
2797         ip->i_d.di_flushiter++;
2798
2799         /*
2800          * Copy the dirty parts of the inode into the on-disk
2801          * inode.  We always copy out the core of the inode,
2802          * because if the inode is dirty at all the core must
2803          * be.
2804          */
2805         xfs_dinode_to_disk(dip, &ip->i_d);
2806
2807         /* Wrap, we never let the log put out DI_MAX_FLUSH */
2808         if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
2809                 ip->i_d.di_flushiter = 0;
2810
2811         /*
2812          * If this is really an old format inode and the superblock version
2813          * has not been updated to support only new format inodes, then
2814          * convert back to the old inode format.  If the superblock version
2815          * has been updated, then make the conversion permanent.
2816          */
2817         ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
2818         if (ip->i_d.di_version == 1) {
2819                 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
2820                         /*
2821                          * Convert it back.
2822                          */
2823                         ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
2824                         dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
2825                 } else {
2826                         /*
2827                          * The superblock version has already been bumped,
2828                          * so just make the conversion to the new inode
2829                          * format permanent.
2830                          */
2831                         ip->i_d.di_version = 2;
2832                         dip->di_version = 2;
2833                         ip->i_d.di_onlink = 0;
2834                         dip->di_onlink = 0;
2835                         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
2836                         memset(&(dip->di_pad[0]), 0,
2837                               sizeof(dip->di_pad));
2838                         ASSERT(xfs_get_projid(ip) == 0);
2839                 }
2840         }
2841
2842         xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
2843         if (XFS_IFORK_Q(ip))
2844                 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
2845         xfs_inobp_check(mp, bp);
2846
2847         /*
2848          * We've recorded everything logged in the inode, so we'd like to clear
2849          * the ili_fields bits so we don't log and flush things unnecessarily.
2850          * However, we can't stop logging all this information until the data
2851          * we've copied into the disk buffer is written to disk.  If we did we
2852          * might overwrite the copy of the inode in the log with all the data
2853          * after re-logging only part of it, and in the face of a crash we
2854          * wouldn't have all the data we need to recover.
2855          *
2856          * What we do is move the bits to the ili_last_fields field.  When
2857          * logging the inode, these bits are moved back to the ili_fields field.
2858          * In the xfs_iflush_done() routine we clear ili_last_fields, since we
2859          * know that the information those bits represent is permanently on
2860          * disk.  As long as the flush completes before the inode is logged
2861          * again, then both ili_fields and ili_last_fields will be cleared.
2862          *
2863          * We can play with the ili_fields bits here, because the inode lock
2864          * must be held exclusively in order to set bits there and the flush
2865          * lock protects the ili_last_fields bits.  Set ili_logged so the flush
2866          * done routine can tell whether or not to look in the AIL.  Also, store
2867          * the current LSN of the inode so that we can tell whether the item has
2868          * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
2869          * need the AIL lock, because it is a 64 bit value that cannot be read
2870          * atomically.
2871          */
2872         if (iip != NULL && iip->ili_fields != 0) {
2873                 iip->ili_last_fields = iip->ili_fields;
2874                 iip->ili_fields = 0;
2875                 iip->ili_logged = 1;
2876
2877                 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2878                                         &iip->ili_item.li_lsn);
2879
2880                 /*
2881                  * Attach the function xfs_iflush_done to the inode's
2882                  * buffer.  This will remove the inode from the AIL
2883                  * and unlock the inode's flush lock when the inode is
2884                  * completely written to disk.
2885                  */
2886                 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
2887
2888                 ASSERT(bp->b_fspriv != NULL);
2889                 ASSERT(bp->b_iodone != NULL);
2890         } else {
2891                 /*
2892                  * We're flushing an inode which is not in the AIL and has
2893                  * not been logged.  For this case we can immediately drop
2894                  * the inode flush lock because we can avoid the whole
2895                  * AIL state thing.  It's OK to drop the flush lock now,
2896                  * because we've already locked the buffer and to do anything
2897                  * you really need both.
2898                  */
2899                 if (iip != NULL) {
2900                         ASSERT(iip->ili_logged == 0);
2901                         ASSERT(iip->ili_last_fields == 0);
2902                         ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
2903                 }
2904                 xfs_ifunlock(ip);
2905         }
2906
2907         return 0;
2908
2909 corrupt_out:
2910         return XFS_ERROR(EFSCORRUPTED);
2911 }
2912
2913 /*
2914  * Return a pointer to the extent record at file index idx.
2915  */
2916 xfs_bmbt_rec_host_t *
2917 xfs_iext_get_ext(
2918         xfs_ifork_t     *ifp,           /* inode fork pointer */
2919         xfs_extnum_t    idx)            /* index of target extent */
2920 {
2921         ASSERT(idx >= 0);
2922         ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
2923
2924         if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
2925                 return ifp->if_u1.if_ext_irec->er_extbuf;
2926         } else if (ifp->if_flags & XFS_IFEXTIREC) {
2927                 xfs_ext_irec_t  *erp;           /* irec pointer */
2928                 int             erp_idx = 0;    /* irec index */
2929                 xfs_extnum_t    page_idx = idx; /* ext index in target list */
2930
2931                 erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
2932                 return &erp->er_extbuf[page_idx];
2933         } else if (ifp->if_bytes) {
2934                 return &ifp->if_u1.if_extents[idx];
2935         } else {
2936                 return NULL;
2937         }
2938 }
2939
2940 /*
2941  * Insert new item(s) into the extent records for incore inode
2942  * fork 'ifp'.  'count' new items are inserted at index 'idx'.
2943  */
2944 void
2945 xfs_iext_insert(
2946         xfs_inode_t     *ip,            /* incore inode pointer */
2947         xfs_extnum_t    idx,            /* starting index of new items */
2948         xfs_extnum_t    count,          /* number of inserted items */
2949         xfs_bmbt_irec_t *new,           /* items to insert */
2950         int             state)          /* type of extent conversion */
2951 {
2952         xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
2953         xfs_extnum_t    i;              /* extent record index */
2954
2955         trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
2956
2957         ASSERT(ifp->if_flags & XFS_IFEXTENTS);
2958         xfs_iext_add(ifp, idx, count);
2959         for (i = idx; i < idx + count; i++, new++)
2960                 xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
2961 }
2962
2963 /*
2964  * This is called when the amount of space required for incore file
2965  * extents needs to be increased. The ext_diff parameter stores the
2966  * number of new extents being added and the idx parameter contains
2967  * the extent index where the new extents will be added. If the new
2968  * extents are being appended, then we just need to (re)allocate and
2969  * initialize the space. Otherwise, if the new extents are being
2970  * inserted into the middle of the existing entries, a bit more work
2971  * is required to make room for the new extents to be inserted. The
2972  * caller is responsible for filling in the new extent entries upon
2973  * return.
2974  */
2975 void
2976 xfs_iext_add(
2977         xfs_ifork_t     *ifp,           /* inode fork pointer */
2978         xfs_extnum_t    idx,            /* index to begin adding exts */
2979         int             ext_diff)       /* number of extents to add */
2980 {
2981         int             byte_diff;      /* new bytes being added */
2982         int             new_size;       /* size of extents after adding */
2983         xfs_extnum_t    nextents;       /* number of extents in file */
2984
2985         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2986         ASSERT((idx >= 0) && (idx <= nextents));
2987         byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
2988         new_size = ifp->if_bytes + byte_diff;
2989         /*
2990          * If the new number of extents (nextents + ext_diff)
2991          * fits inside the inode, then continue to use the inline
2992          * extent buffer.
2993          */
2994         if (nextents + ext_diff <= XFS_INLINE_EXTS) {
2995                 if (idx < nextents) {
2996                         memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
2997                                 &ifp->if_u2.if_inline_ext[idx],
2998                                 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
2999                         memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
3000                 }
3001                 ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3002                 ifp->if_real_bytes = 0;
3003         }
3004         /*
3005          * Otherwise use a linear (direct) extent list.
3006          * If the extents are currently inside the inode,
3007          * xfs_iext_realloc_direct will switch us from
3008          * inline to direct extent allocation mode.
3009          */
3010         else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
3011                 xfs_iext_realloc_direct(ifp, new_size);
3012                 if (idx < nextents) {
3013                         memmove(&ifp->if_u1.if_extents[idx + ext_diff],
3014                                 &ifp->if_u1.if_extents[idx],
3015                                 (nextents - idx) * sizeof(xfs_bmbt_rec_t));
3016                         memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
3017                 }
3018         }
3019         /* Indirection array */
3020         else {
3021                 xfs_ext_irec_t  *erp;
3022                 int             erp_idx = 0;
3023                 int             page_idx = idx;
3024
3025                 ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
3026                 if (ifp->if_flags & XFS_IFEXTIREC) {
3027                         erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
3028                 } else {
3029                         xfs_iext_irec_init(ifp);
3030                         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3031                         erp = ifp->if_u1.if_ext_irec;
3032                 }
3033                 /* Extents fit in target extent page */
3034                 if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
3035                         if (page_idx < erp->er_extcount) {
3036                                 memmove(&erp->er_extbuf[page_idx + ext_diff],
3037                                         &erp->er_extbuf[page_idx],
3038                                         (erp->er_extcount - page_idx) *
3039                                         sizeof(xfs_bmbt_rec_t));
3040                                 memset(&erp->er_extbuf[page_idx], 0, byte_diff);
3041                         }
3042                         erp->er_extcount += ext_diff;
3043                         xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3044                 }
3045                 /* Insert a new extent page */
3046                 else if (erp) {
3047                         xfs_iext_add_indirect_multi(ifp,
3048                                 erp_idx, page_idx, ext_diff);
3049                 }
3050                 /*
3051                  * If extent(s) are being appended to the last page in
3052                  * the indirection array and the new extent(s) don't fit
3053                  * in the page, then erp is NULL and erp_idx is set to
3054                  * the next index needed in the indirection array.
3055                  */
3056                 else {
3057                         int     count = ext_diff;
3058
3059                         while (count) {
3060                                 erp = xfs_iext_irec_new(ifp, erp_idx);
3061                                 erp->er_extcount = count;
3062                                 count -= MIN(count, (int)XFS_LINEAR_EXTS);
3063                                 if (count) {
3064                                         erp_idx++;
3065                                 }
3066                         }
3067                 }
3068         }
3069         ifp->if_bytes = new_size;
3070 }
3071
3072 /*
3073  * This is called when incore extents are being added to the indirection
3074  * array and the new extents do not fit in the target extent list. The
3075  * erp_idx parameter contains the irec index for the target extent list
3076  * in the indirection array, and the idx parameter contains the extent
3077  * index within the list. The number of extents being added is stored
3078  * in the count parameter.
3079  *
3080  *    |-------|   |-------|
3081  *    |       |   |       |    idx - number of extents before idx
3082  *    |  idx  |   | count |
3083  *    |       |   |       |    count - number of extents being inserted at idx
3084  *    |-------|   |-------|
3085  *    | count |   | nex2  |    nex2 - number of extents after idx + count
3086  *    |-------|   |-------|
3087  */
3088 void
3089 xfs_iext_add_indirect_multi(
3090         xfs_ifork_t     *ifp,                   /* inode fork pointer */
3091         int             erp_idx,                /* target extent irec index */
3092         xfs_extnum_t    idx,                    /* index within target list */
3093         int             count)                  /* new extents being added */
3094 {
3095         int             byte_diff;              /* new bytes being added */
3096         xfs_ext_irec_t  *erp;                   /* pointer to irec entry */
3097         xfs_extnum_t    ext_diff;               /* number of extents to add */
3098         xfs_extnum_t    ext_cnt;                /* new extents still needed */
3099         xfs_extnum_t    nex2;                   /* extents after idx + count */
3100         xfs_bmbt_rec_t  *nex2_ep = NULL;        /* temp list for nex2 extents */
3101         int             nlists;                 /* number of irec's (lists) */
3102
3103         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3104         erp = &ifp->if_u1.if_ext_irec[erp_idx];
3105         nex2 = erp->er_extcount - idx;
3106         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3107
3108         /*
3109          * Save second part of target extent list
3110          * (all extents past */
3111         if (nex2) {
3112                 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3113                 nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3114                 memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3115                 erp->er_extcount -= nex2;
3116                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3117                 memset(&erp->er_extbuf[idx], 0, byte_diff);
3118         }
3119
3120         /*
3121          * Add the new extents to the end of the target
3122          * list, then allocate new irec record(s) and
3123          * extent buffer(s) as needed to store the rest
3124          * of the new extents.
3125          */
3126         ext_cnt = count;
3127         ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3128         if (ext_diff) {
3129                 erp->er_extcount += ext_diff;
3130                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3131                 ext_cnt -= ext_diff;
3132         }
3133         while (ext_cnt) {
3134                 erp_idx++;
3135                 erp = xfs_iext_irec_new(ifp, erp_idx);
3136                 ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3137                 erp->er_extcount = ext_diff;
3138                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3139                 ext_cnt -= ext_diff;
3140         }
3141
3142         /* Add nex2 extents back to indirection array */
3143         if (nex2) {
3144                 xfs_extnum_t    ext_avail;
3145                 int             i;
3146
3147                 byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3148                 ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3149                 i = 0;
3150                 /*
3151                  * If nex2 extents fit in the current page, append
3152                  * nex2_ep after the new extents.
3153                  */
3154                 if (nex2 <= ext_avail) {
3155                         i = erp->er_extcount;
3156                 }
3157                 /*
3158                  * Otherwise, check if space is available in the
3159                  * next page.
3160                  */
3161                 else if ((erp_idx < nlists - 1) &&
3162                          (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3163                           ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3164                         erp_idx++;
3165                         erp++;
3166                         /* Create a hole for nex2 extents */
3167                         memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3168                                 erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3169                 }
3170                 /*
3171                  * Final choice, create a new extent page for
3172                  * nex2 extents.
3173                  */
3174                 else {
3175                         erp_idx++;
3176                         erp = xfs_iext_irec_new(ifp, erp_idx);
3177                 }
3178                 memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3179                 kmem_free(nex2_ep);
3180                 erp->er_extcount += nex2;
3181                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3182         }
3183 }
3184
3185 /*
3186  * This is called when the amount of space required for incore file
3187  * extents needs to be decreased. The ext_diff parameter stores the
3188  * number of extents to be removed and the idx parameter contains
3189  * the extent index where the extents will be removed from.
3190  *
3191  * If the amount of space needed has decreased below the linear
3192  * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3193  * extent array.  Otherwise, use kmem_realloc() to adjust the
3194  * size to what is needed.
3195  */
3196 void
3197 xfs_iext_remove(
3198         xfs_inode_t     *ip,            /* incore inode pointer */
3199         xfs_extnum_t    idx,            /* index to begin removing exts */
3200         int             ext_diff,       /* number of extents to remove */
3201         int             state)          /* type of extent conversion */
3202 {
3203         xfs_ifork_t     *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3204         xfs_extnum_t    nextents;       /* number of extents in file */
3205         int             new_size;       /* size of extents after removal */
3206
3207         trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3208
3209         ASSERT(ext_diff > 0);
3210         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3211         new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3212
3213         if (new_size == 0) {
3214                 xfs_iext_destroy(ifp);
3215         } else if (ifp->if_flags & XFS_IFEXTIREC) {
3216                 xfs_iext_remove_indirect(ifp, idx, ext_diff);
3217         } else if (ifp->if_real_bytes) {
3218                 xfs_iext_remove_direct(ifp, idx, ext_diff);
3219         } else {
3220                 xfs_iext_remove_inline(ifp, idx, ext_diff);
3221         }
3222         ifp->if_bytes = new_size;
3223 }
3224
3225 /*
3226  * This removes ext_diff extents from the inline buffer, beginning
3227  * at extent index idx.
3228  */
3229 void
3230 xfs_iext_remove_inline(
3231         xfs_ifork_t     *ifp,           /* inode fork pointer */
3232         xfs_extnum_t    idx,            /* index to begin removing exts */
3233         int             ext_diff)       /* number of extents to remove */
3234 {
3235         int             nextents;       /* number of extents in file */
3236
3237         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3238         ASSERT(idx < XFS_INLINE_EXTS);
3239         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3240         ASSERT(((nextents - ext_diff) > 0) &&
3241                 (nextents - ext_diff) < XFS_INLINE_EXTS);
3242
3243         if (idx + ext_diff < nextents) {
3244                 memmove(&ifp->if_u2.if_inline_ext[idx],
3245                         &ifp->if_u2.if_inline_ext[idx + ext_diff],
3246                         (nextents - (idx + ext_diff)) *
3247                          sizeof(xfs_bmbt_rec_t));
3248                 memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3249                         0, ext_diff * sizeof(xfs_bmbt_rec_t));
3250         } else {
3251                 memset(&ifp->if_u2.if_inline_ext[idx], 0,
3252                         ext_diff * sizeof(xfs_bmbt_rec_t));
3253         }
3254 }
3255
3256 /*
3257  * This removes ext_diff extents from a linear (direct) extent list,
3258  * beginning at extent index idx. If the extents are being removed
3259  * from the end of the list (ie. truncate) then we just need to re-
3260  * allocate the list to remove the extra space. Otherwise, if the
3261  * extents are being removed from the middle of the existing extent
3262  * entries, then we first need to move the extent records beginning
3263  * at idx + ext_diff up in the list to overwrite the records being
3264  * removed, then remove the extra space via kmem_realloc.
3265  */
3266 void
3267 xfs_iext_remove_direct(
3268         xfs_ifork_t     *ifp,           /* inode fork pointer */
3269         xfs_extnum_t    idx,            /* index to begin removing exts */
3270         int             ext_diff)       /* number of extents to remove */
3271 {
3272         xfs_extnum_t    nextents;       /* number of extents in file */
3273         int             new_size;       /* size of extents after removal */
3274
3275         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3276         new_size = ifp->if_bytes -
3277                 (ext_diff * sizeof(xfs_bmbt_rec_t));
3278         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3279
3280         if (new_size == 0) {
3281                 xfs_iext_destroy(ifp);
3282                 return;
3283         }
3284         /* Move extents up in the list (if needed) */
3285         if (idx + ext_diff < nextents) {
3286                 memmove(&ifp->if_u1.if_extents[idx],
3287                         &ifp->if_u1.if_extents[idx + ext_diff],
3288                         (nextents - (idx + ext_diff)) *
3289                          sizeof(xfs_bmbt_rec_t));
3290         }
3291         memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3292                 0, ext_diff * sizeof(xfs_bmbt_rec_t));
3293         /*
3294          * Reallocate the direct extent list. If the extents
3295          * will fit inside the inode then xfs_iext_realloc_direct
3296          * will switch from direct to inline extent allocation
3297          * mode for us.
3298          */
3299         xfs_iext_realloc_direct(ifp, new_size);
3300         ifp->if_bytes = new_size;
3301 }
3302
3303 /*
3304  * This is called when incore extents are being removed from the
3305  * indirection array and the extents being removed span multiple extent
3306  * buffers. The idx parameter contains the file extent index where we
3307  * want to begin removing extents, and the count parameter contains
3308  * how many extents need to be removed.
3309  *
3310  *    |-------|   |-------|
3311  *    | nex1  |   |       |    nex1 - number of extents before idx
3312  *    |-------|   | count |
3313  *    |       |   |       |    count - number of extents being removed at idx
3314  *    | count |   |-------|
3315  *    |       |   | nex2  |    nex2 - number of extents after idx + count
3316  *    |-------|   |-------|
3317  */
3318 void
3319 xfs_iext_remove_indirect(
3320         xfs_ifork_t     *ifp,           /* inode fork pointer */
3321         xfs_extnum_t    idx,            /* index to begin removing extents */
3322         int             count)          /* number of extents to remove */
3323 {
3324         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3325         int             erp_idx = 0;    /* indirection array index */
3326         xfs_extnum_t    ext_cnt;        /* extents left to remove */
3327         xfs_extnum_t    ext_diff;       /* extents to remove in current list */
3328         xfs_extnum_t    nex1;           /* number of extents before idx */
3329         xfs_extnum_t    nex2;           /* extents after idx + count */
3330         int             page_idx = idx; /* index in target extent list */
3331
3332         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3333         erp = xfs_iext_idx_to_irec(ifp,  &page_idx, &erp_idx, 0);
3334         ASSERT(erp != NULL);
3335         nex1 = page_idx;
3336         ext_cnt = count;
3337         while (ext_cnt) {
3338                 nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3339                 ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3340                 /*
3341                  * Check for deletion of entire list;
3342                  * xfs_iext_irec_remove() updates extent offsets.
3343                  */
3344                 if (ext_diff == erp->er_extcount) {
3345                         xfs_iext_irec_remove(ifp, erp_idx);
3346                         ext_cnt -= ext_diff;
3347                         nex1 = 0;
3348                         if (ext_cnt) {
3349                                 ASSERT(erp_idx < ifp->if_real_bytes /
3350                                         XFS_IEXT_BUFSZ);
3351                                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3352                                 nex1 = 0;
3353                                 continue;
3354                         } else {
3355                                 break;
3356                         }
3357                 }
3358                 /* Move extents up (if needed) */
3359                 if (nex2) {
3360                         memmove(&erp->er_extbuf[nex1],
3361                                 &erp->er_extbuf[nex1 + ext_diff],
3362                                 nex2 * sizeof(xfs_bmbt_rec_t));
3363                 }
3364                 /* Zero out rest of page */
3365                 memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3366                         ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3367                 /* Update remaining counters */
3368                 erp->er_extcount -= ext_diff;
3369                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3370                 ext_cnt -= ext_diff;
3371                 nex1 = 0;
3372                 erp_idx++;
3373                 erp++;
3374         }
3375         ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3376         xfs_iext_irec_compact(ifp);
3377 }
3378
3379 /*
3380  * Create, destroy, or resize a linear (direct) block of extents.
3381  */
3382 void
3383 xfs_iext_realloc_direct(
3384         xfs_ifork_t     *ifp,           /* inode fork pointer */
3385         int             new_size)       /* new size of extents */
3386 {
3387         int             rnew_size;      /* real new size of extents */
3388
3389         rnew_size = new_size;
3390
3391         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3392                 ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3393                  (new_size != ifp->if_real_bytes)));
3394
3395         /* Free extent records */
3396         if (new_size == 0) {
3397                 xfs_iext_destroy(ifp);
3398         }
3399         /* Resize direct extent list and zero any new bytes */
3400         else if (ifp->if_real_bytes) {
3401                 /* Check if extents will fit inside the inode */
3402                 if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3403                         xfs_iext_direct_to_inline(ifp, new_size /
3404                                 (uint)sizeof(xfs_bmbt_rec_t));
3405                         ifp->if_bytes = new_size;
3406                         return;
3407                 }
3408                 if (!is_power_of_2(new_size)){
3409                         rnew_size = roundup_pow_of_two(new_size);
3410                 }
3411                 if (rnew_size != ifp->if_real_bytes) {
3412                         ifp->if_u1.if_extents =
3413                                 kmem_realloc(ifp->if_u1.if_extents,
3414                                                 rnew_size,
3415                                                 ifp->if_real_bytes, KM_NOFS);
3416                 }
3417                 if (rnew_size > ifp->if_real_bytes) {
3418                         memset(&ifp->if_u1.if_extents[ifp->if_bytes /
3419                                 (uint)sizeof(xfs_bmbt_rec_t)], 0,
3420                                 rnew_size - ifp->if_real_bytes);
3421                 }
3422         }
3423         /*
3424          * Switch from the inline extent buffer to a direct
3425          * extent list. Be sure to include the inline extent
3426          * bytes in new_size.
3427          */
3428         else {
3429                 new_size += ifp->if_bytes;
3430                 if (!is_power_of_2(new_size)) {
3431                         rnew_size = roundup_pow_of_two(new_size);
3432                 }
3433                 xfs_iext_inline_to_direct(ifp, rnew_size);
3434         }
3435         ifp->if_real_bytes = rnew_size;
3436         ifp->if_bytes = new_size;
3437 }
3438
3439 /*
3440  * Switch from linear (direct) extent records to inline buffer.
3441  */
3442 void
3443 xfs_iext_direct_to_inline(
3444         xfs_ifork_t     *ifp,           /* inode fork pointer */
3445         xfs_extnum_t    nextents)       /* number of extents in file */
3446 {
3447         ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3448         ASSERT(nextents <= XFS_INLINE_EXTS);
3449         /*
3450          * The inline buffer was zeroed when we switched
3451          * from inline to direct extent allocation mode,
3452          * so we don't need to clear it here.
3453          */
3454         memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3455                 nextents * sizeof(xfs_bmbt_rec_t));
3456         kmem_free(ifp->if_u1.if_extents);
3457         ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3458         ifp->if_real_bytes = 0;
3459 }
3460
3461 /*
3462  * Switch from inline buffer to linear (direct) extent records.
3463  * new_size should already be rounded up to the next power of 2
3464  * by the caller (when appropriate), so use new_size as it is.
3465  * However, since new_size may be rounded up, we can't update
3466  * if_bytes here. It is the caller's responsibility to update
3467  * if_bytes upon return.
3468  */
3469 void
3470 xfs_iext_inline_to_direct(
3471         xfs_ifork_t     *ifp,           /* inode fork pointer */
3472         int             new_size)       /* number of extents in file */
3473 {
3474         ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3475         memset(ifp->if_u1.if_extents, 0, new_size);
3476         if (ifp->if_bytes) {
3477                 memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3478                         ifp->if_bytes);
3479                 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3480                         sizeof(xfs_bmbt_rec_t));
3481         }
3482         ifp->if_real_bytes = new_size;
3483 }
3484
3485 /*
3486  * Resize an extent indirection array to new_size bytes.
3487  */
3488 STATIC void
3489 xfs_iext_realloc_indirect(
3490         xfs_ifork_t     *ifp,           /* inode fork pointer */
3491         int             new_size)       /* new indirection array size */
3492 {
3493         int             nlists;         /* number of irec's (ex lists) */
3494         int             size;           /* current indirection array size */
3495
3496         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3497         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3498         size = nlists * sizeof(xfs_ext_irec_t);
3499         ASSERT(ifp->if_real_bytes);
3500         ASSERT((new_size >= 0) && (new_size != size));
3501         if (new_size == 0) {
3502                 xfs_iext_destroy(ifp);
3503         } else {
3504                 ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
3505                         kmem_realloc(ifp->if_u1.if_ext_irec,
3506                                 new_size, size, KM_NOFS);
3507         }
3508 }
3509
3510 /*
3511  * Switch from indirection array to linear (direct) extent allocations.
3512  */
3513 STATIC void
3514 xfs_iext_indirect_to_direct(
3515          xfs_ifork_t    *ifp)           /* inode fork pointer */
3516 {
3517         xfs_bmbt_rec_host_t *ep;        /* extent record pointer */
3518         xfs_extnum_t    nextents;       /* number of extents in file */
3519         int             size;           /* size of file extents */
3520
3521         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3522         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3523         ASSERT(nextents <= XFS_LINEAR_EXTS);
3524         size = nextents * sizeof(xfs_bmbt_rec_t);
3525
3526         xfs_iext_irec_compact_pages(ifp);
3527         ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
3528
3529         ep = ifp->if_u1.if_ext_irec->er_extbuf;
3530         kmem_free(ifp->if_u1.if_ext_irec);
3531         ifp->if_flags &= ~XFS_IFEXTIREC;
3532         ifp->if_u1.if_extents = ep;
3533         ifp->if_bytes = size;
3534         if (nextents < XFS_LINEAR_EXTS) {
3535                 xfs_iext_realloc_direct(ifp, size);
3536         }
3537 }
3538
3539 /*
3540  * Free incore file extents.
3541  */
3542 void
3543 xfs_iext_destroy(
3544         xfs_ifork_t     *ifp)           /* inode fork pointer */
3545 {
3546         if (ifp->if_flags & XFS_IFEXTIREC) {
3547                 int     erp_idx;
3548                 int     nlists;
3549
3550                 nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3551                 for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
3552                         xfs_iext_irec_remove(ifp, erp_idx);
3553                 }
3554                 ifp->if_flags &= ~XFS_IFEXTIREC;
3555         } else if (ifp->if_real_bytes) {
3556                 kmem_free(ifp->if_u1.if_extents);
3557         } else if (ifp->if_bytes) {
3558                 memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3559                         sizeof(xfs_bmbt_rec_t));
3560         }
3561         ifp->if_u1.if_extents = NULL;
3562         ifp->if_real_bytes = 0;
3563         ifp->if_bytes = 0;
3564 }
3565
3566 /*
3567  * Return a pointer to the extent record for file system block bno.
3568  */
3569 xfs_bmbt_rec_host_t *                   /* pointer to found extent record */
3570 xfs_iext_bno_to_ext(
3571         xfs_ifork_t     *ifp,           /* inode fork pointer */
3572         xfs_fileoff_t   bno,            /* block number to search for */
3573         xfs_extnum_t    *idxp)          /* index of target extent */
3574 {
3575         xfs_bmbt_rec_host_t *base;      /* pointer to first extent */
3576         xfs_filblks_t   blockcount = 0; /* number of blocks in extent */
3577         xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */
3578         xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3579         int             high;           /* upper boundary in search */
3580         xfs_extnum_t    idx = 0;        /* index of target extent */
3581         int             low;            /* lower boundary in search */
3582         xfs_extnum_t    nextents;       /* number of file extents */
3583         xfs_fileoff_t   startoff = 0;   /* start offset of extent */
3584
3585         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3586         if (nextents == 0) {
3587                 *idxp = 0;
3588                 return NULL;
3589         }
3590         low = 0;
3591         if (ifp->if_flags & XFS_IFEXTIREC) {
3592                 /* Find target extent list */
3593                 int     erp_idx = 0;
3594                 erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
3595                 base = erp->er_extbuf;
3596                 high = erp->er_extcount - 1;
3597         } else {
3598                 base = ifp->if_u1.if_extents;
3599                 high = nextents - 1;
3600         }
3601         /* Binary search extent records */
3602         while (low <= high) {
3603                 idx = (low + high) >> 1;
3604                 ep = base + idx;
3605                 startoff = xfs_bmbt_get_startoff(ep);
3606                 blockcount = xfs_bmbt_get_blockcount(ep);
3607                 if (bno < startoff) {
3608                         high = idx - 1;
3609                 } else if (bno >= startoff + blockcount) {
3610                         low = idx + 1;
3611                 } else {
3612                         /* Convert back to file-based extent index */
3613                         if (ifp->if_flags & XFS_IFEXTIREC) {
3614                                 idx += erp->er_extoff;
3615                         }
3616                         *idxp = idx;
3617                         return ep;
3618                 }
3619         }
3620         /* Convert back to file-based extent index */
3621         if (ifp->if_flags & XFS_IFEXTIREC) {
3622                 idx += erp->er_extoff;
3623         }
3624         if (bno >= startoff + blockcount) {
3625                 if (++idx == nextents) {
3626                         ep = NULL;
3627                 } else {
3628                         ep = xfs_iext_get_ext(ifp, idx);
3629                 }
3630         }
3631         *idxp = idx;
3632         return ep;
3633 }
3634
3635 /*
3636  * Return a pointer to the indirection array entry containing the
3637  * extent record for filesystem block bno. Store the index of the
3638  * target irec in *erp_idxp.
3639  */
3640 xfs_ext_irec_t *                        /* pointer to found extent record */
3641 xfs_iext_bno_to_irec(
3642         xfs_ifork_t     *ifp,           /* inode fork pointer */
3643         xfs_fileoff_t   bno,            /* block number to search for */
3644         int             *erp_idxp)      /* irec index of target ext list */
3645 {
3646         xfs_ext_irec_t  *erp = NULL;    /* indirection array pointer */
3647         xfs_ext_irec_t  *erp_next;      /* next indirection array entry */
3648         int             erp_idx;        /* indirection array index */
3649         int             nlists;         /* number of extent irec's (lists) */
3650         int             high;           /* binary search upper limit */
3651         int             low;            /* binary search lower limit */
3652
3653         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3654         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3655         erp_idx = 0;
3656         low = 0;
3657         high = nlists - 1;
3658         while (low <= high) {
3659                 erp_idx = (low + high) >> 1;
3660                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3661                 erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
3662                 if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
3663                         high = erp_idx - 1;
3664                 } else if (erp_next && bno >=
3665                            xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
3666                         low = erp_idx + 1;
3667                 } else {
3668                         break;
3669                 }
3670         }
3671         *erp_idxp = erp_idx;
3672         return erp;
3673 }
3674
3675 /*
3676  * Return a pointer to the indirection array entry containing the
3677  * extent record at file extent index *idxp. Store the index of the
3678  * target irec in *erp_idxp and store the page index of the target
3679  * extent record in *idxp.
3680  */
3681 xfs_ext_irec_t *
3682 xfs_iext_idx_to_irec(
3683         xfs_ifork_t     *ifp,           /* inode fork pointer */
3684         xfs_extnum_t    *idxp,          /* extent index (file -> page) */
3685         int             *erp_idxp,      /* pointer to target irec */
3686         int             realloc)        /* new bytes were just added */
3687 {
3688         xfs_ext_irec_t  *prev;          /* pointer to previous irec */
3689         xfs_ext_irec_t  *erp = NULL;    /* pointer to current irec */
3690         int             erp_idx;        /* indirection array index */
3691         int             nlists;         /* number of irec's (ex lists) */
3692         int             high;           /* binary search upper limit */
3693         int             low;            /* binary search lower limit */
3694         xfs_extnum_t    page_idx = *idxp; /* extent index in target list */
3695
3696         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3697         ASSERT(page_idx >= 0);
3698         ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3699         ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3700
3701         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3702         erp_idx = 0;
3703         low = 0;
3704         high = nlists - 1;
3705
3706         /* Binary search extent irec's */
3707         while (low <= high) {
3708                 erp_idx = (low + high) >> 1;
3709                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3710                 prev = erp_idx > 0 ? erp - 1 : NULL;
3711                 if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
3712                      realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
3713                         high = erp_idx - 1;
3714                 } else if (page_idx > erp->er_extoff + erp->er_extcount ||
3715                            (page_idx == erp->er_extoff + erp->er_extcount &&
3716                             !realloc)) {
3717                         low = erp_idx + 1;
3718                 } else if (page_idx == erp->er_extoff + erp->er_extcount &&
3719                            erp->er_extcount == XFS_LINEAR_EXTS) {
3720                         ASSERT(realloc);
3721                         page_idx = 0;
3722                         erp_idx++;
3723                         erp = erp_idx < nlists ? erp + 1 : NULL;
3724                         break;
3725                 } else {
3726                         page_idx -= erp->er_extoff;
3727                         break;
3728                 }
3729         }
3730         *idxp = page_idx;
3731         *erp_idxp = erp_idx;
3732         return(erp);
3733 }
3734
3735 /*
3736  * Allocate and initialize an indirection array once the space needed
3737  * for incore extents increases above XFS_IEXT_BUFSZ.
3738  */
3739 void
3740 xfs_iext_irec_init(
3741         xfs_ifork_t     *ifp)           /* inode fork pointer */
3742 {
3743         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3744         xfs_extnum_t    nextents;       /* number of extents in file */
3745
3746         ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3747         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3748         ASSERT(nextents <= XFS_LINEAR_EXTS);
3749
3750         erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
3751
3752         if (nextents == 0) {
3753                 ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3754         } else if (!ifp->if_real_bytes) {
3755                 xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
3756         } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
3757                 xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
3758         }
3759         erp->er_extbuf = ifp->if_u1.if_extents;
3760         erp->er_extcount = nextents;
3761         erp->er_extoff = 0;
3762
3763         ifp->if_flags |= XFS_IFEXTIREC;
3764         ifp->if_real_bytes = XFS_IEXT_BUFSZ;
3765         ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
3766         ifp->if_u1.if_ext_irec = erp;
3767
3768         return;
3769 }
3770
3771 /*
3772  * Allocate and initialize a new entry in the indirection array.
3773  */
3774 xfs_ext_irec_t *
3775 xfs_iext_irec_new(
3776         xfs_ifork_t     *ifp,           /* inode fork pointer */
3777         int             erp_idx)        /* index for new irec */
3778 {
3779         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3780         int             i;              /* loop counter */
3781         int             nlists;         /* number of irec's (ex lists) */
3782
3783         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3784         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3785
3786         /* Resize indirection array */
3787         xfs_iext_realloc_indirect(ifp, ++nlists *
3788                                   sizeof(xfs_ext_irec_t));
3789         /*
3790          * Move records down in the array so the
3791          * new page can use erp_idx.
3792          */
3793         erp = ifp->if_u1.if_ext_irec;
3794         for (i = nlists - 1; i > erp_idx; i--) {
3795                 memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
3796         }
3797         ASSERT(i == erp_idx);
3798
3799         /* Initialize new extent record */
3800         erp = ifp->if_u1.if_ext_irec;
3801         erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3802         ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3803         memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
3804         erp[erp_idx].er_extcount = 0;
3805         erp[erp_idx].er_extoff = erp_idx > 0 ?
3806                 erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
3807         return (&erp[erp_idx]);
3808 }
3809
3810 /*
3811  * Remove a record from the indirection array.
3812  */
3813 void
3814 xfs_iext_irec_remove(
3815         xfs_ifork_t     *ifp,           /* inode fork pointer */
3816         int             erp_idx)        /* irec index to remove */
3817 {
3818         xfs_ext_irec_t  *erp;           /* indirection array pointer */
3819         int             i;              /* loop counter */
3820         int             nlists;         /* number of irec's (ex lists) */
3821
3822         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3823         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3824         erp = &ifp->if_u1.if_ext_irec[erp_idx];
3825         if (erp->er_extbuf) {
3826                 xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
3827                         -erp->er_extcount);
3828                 kmem_free(erp->er_extbuf);
3829         }
3830         /* Compact extent records */
3831         erp = ifp->if_u1.if_ext_irec;
3832         for (i = erp_idx; i < nlists - 1; i++) {
3833                 memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
3834         }
3835         /*
3836          * Manually free the last extent record from the indirection
3837          * array.  A call to xfs_iext_realloc_indirect() with a size
3838          * of zero would result in a call to xfs_iext_destroy() which
3839          * would in turn call this function again, creating a nasty
3840          * infinite loop.
3841          */
3842         if (--nlists) {
3843                 xfs_iext_realloc_indirect(ifp,
3844                         nlists * sizeof(xfs_ext_irec_t));
3845         } else {
3846                 kmem_free(ifp->if_u1.if_ext_irec);
3847         }
3848         ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3849 }
3850
3851 /*
3852  * This is called to clean up large amounts of unused memory allocated
3853  * by the indirection array.  Before compacting anything though, verify
3854  * that the indirection array is still needed and switch back to the
3855  * linear extent list (or even the inline buffer) if possible.  The
3856  * compaction policy is as follows:
3857  *
3858  *    Full Compaction: Extents fit into a single page (or inline buffer)
3859  * Partial Compaction: Extents occupy less than 50% of allocated space
3860  *      No Compaction: Extents occupy at least 50% of allocated space
3861  */
3862 void
3863 xfs_iext_irec_compact(
3864         xfs_ifork_t     *ifp)           /* inode fork pointer */
3865 {
3866         xfs_extnum_t    nextents;       /* number of extents in file */
3867         int             nlists;         /* number of irec's (ex lists) */
3868
3869         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3870         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3871         nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3872
3873         if (nextents == 0) {
3874                 xfs_iext_destroy(ifp);
3875         } else if (nextents <= XFS_INLINE_EXTS) {
3876                 xfs_iext_indirect_to_direct(ifp);
3877                 xfs_iext_direct_to_inline(ifp, nextents);
3878         } else if (nextents <= XFS_LINEAR_EXTS) {
3879                 xfs_iext_indirect_to_direct(ifp);
3880         } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
3881                 xfs_iext_irec_compact_pages(ifp);
3882         }
3883 }
3884
3885 /*
3886  * Combine extents from neighboring extent pages.
3887  */
3888 void
3889 xfs_iext_irec_compact_pages(
3890         xfs_ifork_t     *ifp)           /* inode fork pointer */
3891 {
3892         xfs_ext_irec_t  *erp, *erp_next;/* pointers to irec entries */
3893         int             erp_idx = 0;    /* indirection array index */
3894         int             nlists;         /* number of irec's (ex lists) */
3895
3896         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3897         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3898         while (erp_idx < nlists - 1) {
3899                 erp = &ifp->if_u1.if_ext_irec[erp_idx];
3900                 erp_next = erp + 1;
3901                 if (erp_next->er_extcount <=
3902                     (XFS_LINEAR_EXTS - erp->er_extcount)) {
3903                         memcpy(&erp->er_extbuf[erp->er_extcount],
3904                                 erp_next->er_extbuf, erp_next->er_extcount *
3905                                 sizeof(xfs_bmbt_rec_t));
3906                         erp->er_extcount += erp_next->er_extcount;
3907                         /*
3908                          * Free page before removing extent record
3909                          * so er_extoffs don't get modified in
3910                          * xfs_iext_irec_remove.
3911                          */
3912                         kmem_free(erp_next->er_extbuf);
3913                         erp_next->er_extbuf = NULL;
3914                         xfs_iext_irec_remove(ifp, erp_idx + 1);
3915                         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3916                 } else {
3917                         erp_idx++;
3918                 }
3919         }
3920 }
3921
3922 /*
3923  * This is called to update the er_extoff field in the indirection
3924  * array when extents have been added or removed from one of the
3925  * extent lists. erp_idx contains the irec index to begin updating
3926  * at and ext_diff contains the number of extents that were added
3927  * or removed.
3928  */
3929 void
3930 xfs_iext_irec_update_extoffs(
3931         xfs_ifork_t     *ifp,           /* inode fork pointer */
3932         int             erp_idx,        /* irec index to update */
3933         int             ext_diff)       /* number of new extents */
3934 {
3935         int             i;              /* loop counter */
3936         int             nlists;         /* number of irec's (ex lists */
3937
3938         ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3939         nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3940         for (i = erp_idx; i < nlists; i++) {
3941                 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3942         }
3943 }
3944
3945 /*
3946  * Test whether it is appropriate to check an inode for and free post EOF
3947  * blocks. The 'force' parameter determines whether we should also consider
3948  * regular files that are marked preallocated or append-only.
3949  */
3950 bool
3951 xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
3952 {
3953         /* prealloc/delalloc exists only on regular files */
3954         if (!S_ISREG(ip->i_d.di_mode))
3955                 return false;
3956
3957         /*
3958          * Zero sized files with no cached pages and delalloc blocks will not
3959          * have speculative prealloc/delalloc blocks to remove.
3960          */
3961         if (VFS_I(ip)->i_size == 0 &&
3962             VN_CACHED(VFS_I(ip)) == 0 &&
3963             ip->i_delayed_blks == 0)
3964                 return false;
3965
3966         /* If we haven't read in the extent list, then don't do it now. */
3967         if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
3968                 return false;
3969
3970         /*
3971          * Do not free real preallocated or append-only files unless the file
3972          * has delalloc blocks and we are forced to remove them.
3973          */
3974         if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
3975                 if (!force || ip->i_delayed_blks == 0)
3976                         return false;
3977
3978         return true;
3979 }
3980