fs/xfs/xfs_inode.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include <linux/log2.h>
  19
  20 #include "xfs.h"
  21 #include "xfs_fs.h"
  22 #include "xfs_shared.h"
  23 #include "xfs_format.h"
  24 #include "xfs_log_format.h"
  25 #include "xfs_trans_resv.h"
  26 #include "xfs_inum.h"
  27 #include "xfs_sb.h"
  28 #include "xfs_ag.h"
  29 #include "xfs_mount.h"
  30 #include "xfs_inode.h"
  31 #include "xfs_da_format.h"
  32 #include "xfs_da_btree.h"
  33 #include "xfs_dir2.h"
  34 #include "xfs_attr_sf.h"
  35 #include "xfs_attr.h"
  36 #include "xfs_trans_space.h"
  37 #include "xfs_trans.h"
  38 #include "xfs_buf_item.h"
  39 #include "xfs_inode_item.h"
  40 #include "xfs_ialloc.h"
  41 #include "xfs_bmap.h"
  42 #include "xfs_bmap_util.h"
  43 #include "xfs_error.h"
  44 #include "xfs_quota.h"
  45 #include "xfs_dinode.h"
  46 #include "xfs_filestream.h"
  47 #include "xfs_cksum.h"
  48 #include "xfs_trace.h"
  49 #include "xfs_icache.h"
  50 #include "xfs_symlink.h"
  51 #include "xfs_trans_priv.h"
  52 #include "xfs_log.h"
  53 #include "xfs_bmap_btree.h"
  54
  55 kmem_zone_t *xfs_inode_zone;  /* NOTE(review): presumably the zone in-core inodes are allocated from; initialised outside this section -- confirm */
  56
  57 /*
  58  * Used in xfs_itruncate_extents().  This is the maximum number of extents
  59  * freed from a file in a single transaction.
  60  */
  61 #define XFS_ITRUNC_MAX_EXTENTS  2
  62
  63 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);  /* forward declaration of a file-local function */
  64
  65 /*
  66  * helper function to extract extent size hint from inode
  67  */
  68 xfs_extlen_t
  69 xfs_get_extsz_hint(
  70         struct xfs_inode        *ip)
  71 {
  72         if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
  73                 return ip->i_d.di_extsize;
  74         if (XFS_IS_REALTIME_INODE(ip))
  75                 return ip->i_mount->m_sb.sb_rextsize;
  76         return 0;
  77 }
  78
  79 /*
  80  * This is a wrapper routine around the xfs_ilock() routine used to centralize
  81  * some grungy code.  It is used in places that wish to lock the inode solely
  82  * for reading the extents.  The reason these places can't just call
  83  * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the
  84  * extents from disk for a file in b-tree format.  If the inode is in b-tree
  85  * format, then we need to lock the inode exclusively until the extents are read
  86  * in.  Locking it exclusively all the time would limit our parallelism
  87  * unnecessarily, though.  What we do instead is check to see if the extents
  88  * have been read in yet, and only lock the inode exclusively if they have not.
  89  *
  90  * The function returns a value which should be given to the corresponding
  91  * xfs_iunlock() call.
  92  */
  93 uint
  94 xfs_ilock_data_map_shared(
  95         struct xfs_inode        *ip)
  96 {
  97         uint                    lock_mode = XFS_ILOCK_SHARED;
  98
  99         if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
 100             (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
 101                 lock_mode = XFS_ILOCK_EXCL;
 102         xfs_ilock(ip, lock_mode);
 103         return lock_mode;
 104 }
 105
 106 /*
 107  * The xfs inode contains 2 locks: a multi-reader lock called the
 108  * i_iolock and a multi-reader lock called the i_lock.  This routine
 109  * allows either or both of the locks to be obtained.
 110  *
 111  * The 2 locks should always be ordered so that the IO lock is
 112  * obtained first in order to prevent deadlock.
 113  *
 114  * ip -- the inode being locked
 115  * lock_flags -- this parameter indicates the inode's locks
 116  *       to be locked.  It can be:
 117  *              XFS_IOLOCK_SHARED,
 118  *              XFS_IOLOCK_EXCL,
 119  *              XFS_ILOCK_SHARED,
 120  *              XFS_ILOCK_EXCL,
 121  *              XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 122  *              XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 123  *              XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 124  *              XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 125  */
 126 void
 127 xfs_ilock(
 128         xfs_inode_t             *ip,
 129         uint                    lock_flags)
 130 {
 131         trace_xfs_ilock(ip, lock_flags, _RET_IP_);
 132
 133         /*
 134          * You can't set both SHARED and EXCL for the same lock,
 135          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 136          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 137          */
 138         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 139                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 140         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 141                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 142         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 143
 144         if (lock_flags & XFS_IOLOCK_EXCL)
 145                 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 146         else if (lock_flags & XFS_IOLOCK_SHARED)
 147                 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
 148
 149         if (lock_flags & XFS_ILOCK_EXCL)
 150                 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 151         else if (lock_flags & XFS_ILOCK_SHARED)
 152                 mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 153 }
 154
 155 /*
 156  * This is just like xfs_ilock(), except that the caller
 157  * is guaranteed not to sleep.  It returns 1 if it gets
 158  * the requested locks and 0 otherwise.  If the IO lock is
 159  * obtained but the inode lock cannot be, then the IO lock
 160  * is dropped before returning.
 161  *
 162  * ip -- the inode being locked
 163  * lock_flags -- this parameter indicates the inode's locks to be
 164  *       to be locked.  See the comment for xfs_ilock() for a list
 165  *       of valid values.
 166  */
 167 int
 168 xfs_ilock_nowait(
 169         xfs_inode_t             *ip,
 170         uint                    lock_flags)
 171 {
 172         trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
 173
 174         /*
 175          * You can't set both SHARED and EXCL for the same lock,
 176          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 177          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 178          */
 179         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 180                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 181         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 182                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 183         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 184
 185         if (lock_flags & XFS_IOLOCK_EXCL) {
 186                 if (!mrtryupdate(&ip->i_iolock))
 187                         goto out;
 188         } else if (lock_flags & XFS_IOLOCK_SHARED) {
 189                 if (!mrtryaccess(&ip->i_iolock))
 190                         goto out;
 191         }
 192         if (lock_flags & XFS_ILOCK_EXCL) {
 193                 if (!mrtryupdate(&ip->i_lock))
 194                         goto out_undo_iolock;
 195         } else if (lock_flags & XFS_ILOCK_SHARED) {
 196                 if (!mrtryaccess(&ip->i_lock))
 197                         goto out_undo_iolock;
 198         }
 199         return 1;
 200
 201  out_undo_iolock:
 202         if (lock_flags & XFS_IOLOCK_EXCL)
 203                 mrunlock_excl(&ip->i_iolock);
 204         else if (lock_flags & XFS_IOLOCK_SHARED)
 205                 mrunlock_shared(&ip->i_iolock);
 206  out:
 207         return 0;
 208 }
 209
 210 /*
 211  * xfs_iunlock() is used to drop the inode locks acquired with
 212  * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 213  * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 214  * that we know which locks to drop.
 215  *
 216  * ip -- the inode being unlocked
 217  * lock_flags -- this parameter indicates the inode's locks to be
 218  *       to be unlocked.  See the comment for xfs_ilock() for a list
 219  *       of valid values for this parameter.
 220  *
 221  */
 222 void
 223 xfs_iunlock(
 224         xfs_inode_t             *ip,
 225         uint                    lock_flags)
 226 {
 227         /*
 228          * You can't set both SHARED and EXCL for the same lock,
 229          * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
 230          * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
 231          */
 232         ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
 233                (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
 234         ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
 235                (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
 236         ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
 237         ASSERT(lock_flags != 0);
 238
 239         if (lock_flags & XFS_IOLOCK_EXCL)
 240                 mrunlock_excl(&ip->i_iolock);
 241         else if (lock_flags & XFS_IOLOCK_SHARED)
 242                 mrunlock_shared(&ip->i_iolock);
 243
 244         if (lock_flags & XFS_ILOCK_EXCL)
 245                 mrunlock_excl(&ip->i_lock);
 246         else if (lock_flags & XFS_ILOCK_SHARED)
 247                 mrunlock_shared(&ip->i_lock);
 248
 249         trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 250 }
 251
 252 /*
 253  * give up write locks.  the i/o lock cannot be held nested
 254  * if it is being demoted.
 255  */
 256 void
 257 xfs_ilock_demote(
 258         xfs_inode_t             *ip,
 259         uint                    lock_flags)
 260 {
 261         ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
 262         ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 263
 264         if (lock_flags & XFS_ILOCK_EXCL)
 265                 mrdemote(&ip->i_lock);
 266         if (lock_flags & XFS_IOLOCK_EXCL)
 267                 mrdemote(&ip->i_iolock);
 268
 269         trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 270 }
 271
 272 #if defined(DEBUG) || defined(XFS_WARN)
 273 int
 274 xfs_isilocked(
 275         xfs_inode_t             *ip,
 276         uint                    lock_flags)
 277 {
 278         if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
 279                 if (!(lock_flags & XFS_ILOCK_SHARED))
 280                         return !!ip->i_lock.mr_writer;
 281                 return rwsem_is_locked(&ip->i_lock.mr_lock);
 282         }
 283
 284         if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
 285                 if (!(lock_flags & XFS_IOLOCK_SHARED))
 286                         return !!ip->i_iolock.mr_writer;
 287                 return rwsem_is_locked(&ip->i_iolock.mr_lock);
 288         }
 289
 290         ASSERT(0);
 291         return 0;
 292 }
 293 #endif
 294
 295 #ifdef DEBUG  /* DEBUG-only counters, incremented by xfs_lock_inodes() */
 296 int xfs_locked_n;  /* calls that finished without a failed trylock */
 297 int xfs_small_retries;  /* calls that finished after 1-4 failed trylocks */
 298 int xfs_middle_retries;  /* calls that finished after 5-99 failed trylocks */
 299 int xfs_lots_retries;  /* calls that finished after 100 or more failed trylocks */
 300 int xfs_lock_delays;  /* number of times a retry first slept in delay(1) */
 301 #endif
 302
 303 /*
 304  * Bump the subclass so xfs_lock_inodes() acquires each lock with
 305  * a different value
 306  */
 307 static inline int
 308 xfs_lock_inumorder(int lock_mode, int subclass)
 309 {
 310         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
 311                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
 312         if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
 313                 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
 314
 315         return lock_mode;
 316 }
 317
 318 /*
 319  * The following routine will lock n inodes in exclusive mode.
 320  * We assume the caller calls us with the inodes in i_ino order.
 321  *
 322  * We need to detect deadlock where an inode that we lock
 323  * is in the AIL and we start waiting for another inode that is locked
 324  * by a thread in a long running transaction (such as truncate). This can
 325  * result in deadlock since the long running trans might need to wait
 326  * for the inode we just locked in order to push the tail and free space
 327  * in the log.
 328  */
 329 void
 330 xfs_lock_inodes(
 331         xfs_inode_t     **ips,
 332         int             inodes,
 333         uint            lock_mode)
 334 {
 335         int             attempts = 0, i, j, try_lock;
 336         xfs_log_item_t  *lp;
 337
 338         ASSERT(ips && (inodes >= 2)); /* we need at least two */
 339
 340         try_lock = 0;
 341         i = 0;
 342
 343 again:
 344         for (; i < inodes; i++) {
 345                 ASSERT(ips[i]);
 346
 347                 if (i && (ips[i] == ips[i-1]))  /* Already locked */
 348                         continue;
 349
 350                 /*
 351                  * If try_lock is not set yet, make sure all locked inodes
 352                  * are not in the AIL.
 353                  * If any are, set try_lock to be used later.
 354                  */
 355
 356                 if (!try_lock) {
 357                         for (j = (i - 1); j >= 0 && !try_lock; j--) {
 358                                 lp = (xfs_log_item_t *)ips[j]->i_itemp;
 359                                 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
 360                                         try_lock++;
 361                                 }
 362                         }
 363                 }
 364
 365                 /*
 366                  * If any of the previous locks we have locked is in the AIL,
 367                  * we must TRY to get the second and subsequent locks. If
 368                  * we can't get any, we must release all we have
 369                  * and try again.
 370                  */
 371
 372                 if (try_lock) {
 373                         /* try_lock must be 0 if i is 0. */
 374                         /*
 375                          * try_lock means we have an inode locked
 376                          * that is in the AIL.
 377                          */
 378                         ASSERT(i != 0);
 379                         if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
 380                                 attempts++;
 381
 382                                 /*
 383                                  * Unlock all previous guys and try again.
 384                                  * xfs_iunlock will try to push the tail
 385                                  * if the inode is in the AIL.
 386                                  */
 387
 388                                 for(j = i - 1; j >= 0; j--) {
 389
 390                                         /*
 391                                          * Check to see if we've already
 392                                          * unlocked this one.
 393                                          * Not the first one going back,
 394                                          * and the inode ptr is the same.
 395                                          */
 396                                         if ((j != (i - 1)) && ips[j] ==
 397                                                                 ips[j+1])
 398                                                 continue;
 399
 400                                         xfs_iunlock(ips[j], lock_mode);
 401                                 }
 402
 403                                 if ((attempts % 5) == 0) {
 404                                         delay(1); /* Don't just spin the CPU */
 405 #ifdef DEBUG
 406                                         xfs_lock_delays++;
 407 #endif
 408                                 }
 409                                 i = 0;
 410                                 try_lock = 0;
 411                                 goto again;
 412                         }
 413                 } else {
 414                         xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
 415                 }
 416         }
 417
 418 #ifdef DEBUG
 419         if (attempts) {
 420                 if (attempts < 5) xfs_small_retries++;
 421                 else if (attempts < 100) xfs_middle_retries++;
 422                 else xfs_lots_retries++;
 423         } else {
 424                 xfs_locked_n++;
 425         }
 426 #endif
 427 }
 428
 429 /*
 430  * xfs_lock_two_inodes() can only be used to lock one type of lock
 431  * at a time - the iolock or the ilock, but not both at once. If
 432  * we lock both at once, lockdep will report false positives saying
 433  * we have violated locking orders.
 434  */
 435 void
 436 xfs_lock_two_inodes(
 437         xfs_inode_t             *ip0,
 438         xfs_inode_t             *ip1,
 439         uint                    lock_mode)
 440 {
 441         xfs_inode_t             *temp;
 442         int                     attempts = 0;
 443         xfs_log_item_t          *lp;
 444
 445         if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
 446                 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
 447         ASSERT(ip0->i_ino != ip1->i_ino);
 448
 449         if (ip0->i_ino > ip1->i_ino) {
 450                 temp = ip0;
 451                 ip0 = ip1;
 452                 ip1 = temp;
 453         }
 454
 455  again:
 456         xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
 457
 458         /*
 459          * If the first lock we have locked is in the AIL, we must TRY to get
 460          * the second lock. If we can't get it, we must release the first one
 461          * and try again.
 462          */
 463         lp = (xfs_log_item_t *)ip0->i_itemp;
 464         if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
 465                 if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
 466                         xfs_iunlock(ip0, lock_mode);
 467                         if ((++attempts % 5) == 0)
 468                                 delay(1); /* Don't just spin the CPU */
 469                         goto again;
 470                 }
 471         } else {
 472                 xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
 473         }
 474 }
 475
 476
 477 void
 478 __xfs_iflock(
 479         struct xfs_inode        *ip)
 480 {
 481         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
 482         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
 483
 484         do {
 485                 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 486                 if (xfs_isiflocked(ip))
 487                         io_schedule();
 488         } while (!xfs_iflock_nowait(ip));
 489
 490         finish_wait(wq, &wait.wait);
 491 }
 492
 493 STATIC uint
 494 _xfs_dic2xflags(
 495         __uint16_t              di_flags)
 496 {
 497         uint                    flags = 0;
 498
 499         if (di_flags & XFS_DIFLAG_ANY) {
 500                 if (di_flags & XFS_DIFLAG_REALTIME)
 501                         flags |= XFS_XFLAG_REALTIME;
 502                 if (di_flags & XFS_DIFLAG_PREALLOC)
 503                         flags |= XFS_XFLAG_PREALLOC;
 504                 if (di_flags & XFS_DIFLAG_IMMUTABLE)
 505                         flags |= XFS_XFLAG_IMMUTABLE;
 506                 if (di_flags & XFS_DIFLAG_APPEND)
 507                         flags |= XFS_XFLAG_APPEND;
 508                 if (di_flags & XFS_DIFLAG_SYNC)
 509                         flags |= XFS_XFLAG_SYNC;
 510                 if (di_flags & XFS_DIFLAG_NOATIME)
 511                         flags |= XFS_XFLAG_NOATIME;
 512                 if (di_flags & XFS_DIFLAG_NODUMP)
 513                         flags |= XFS_XFLAG_NODUMP;
 514                 if (di_flags & XFS_DIFLAG_RTINHERIT)
 515                         flags |= XFS_XFLAG_RTINHERIT;
 516                 if (di_flags & XFS_DIFLAG_PROJINHERIT)
 517                         flags |= XFS_XFLAG_PROJINHERIT;
 518                 if (di_flags & XFS_DIFLAG_NOSYMLINKS)
 519                         flags |= XFS_XFLAG_NOSYMLINKS;
 520                 if (di_flags & XFS_DIFLAG_EXTSIZE)
 521                         flags |= XFS_XFLAG_EXTSIZE;
 522                 if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
 523                         flags |= XFS_XFLAG_EXTSZINHERIT;
 524                 if (di_flags & XFS_DIFLAG_NODEFRAG)
 525                         flags |= XFS_XFLAG_NODEFRAG;
 526                 if (di_flags & XFS_DIFLAG_FILESTREAM)
 527                         flags |= XFS_XFLAG_FILESTREAM;
 528         }
 529
 530         return flags;
 531 }
 532
 533 uint
 534 xfs_ip2xflags(
 535         xfs_inode_t             *ip)
 536 {
 537         xfs_icdinode_t          *dic = &ip->i_d;
 538
 539         return _xfs_dic2xflags(dic->di_flags) |
 540                                 (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
 541 }
 542
 543 uint
 544 xfs_dic2xflags(
 545         xfs_dinode_t            *dip)
 546 {
 547         return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
 548                                 (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
 549 }
 550
 551 /*
 552  * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
 553  * is allowed, otherwise it has to be an exact match. If a CI match is found,
 554  * ci_name->name will point to a the actual name (caller must free) or
 555  * will be set to NULL if an exact match is found.
 556  */
 557 int
 558 xfs_lookup(
 559         xfs_inode_t             *dp,
 560         struct xfs_name         *name,
 561         xfs_inode_t             **ipp,
 562         struct xfs_name         *ci_name)
 563 {
 564         xfs_ino_t               inum;
 565         int                     error;
 566         uint                    lock_mode;
 567
 568         trace_xfs_lookup(dp, name);
 569
 570         if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 571                 return XFS_ERROR(EIO);
 572
 573         lock_mode = xfs_ilock_data_map_shared(dp);
 574         error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
 575         xfs_iunlock(dp, lock_mode);
 576
 577         if (error)
 578                 goto out;
 579
 580         error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
 581         if (error)
 582                 goto out_free_name;
 583
 584         return 0;
 585
 586 out_free_name:
 587         if (ci_name)
 588                 kmem_free(ci_name->name);
 589 out:
 590         *ipp = NULL;
 591         return error;
 592 }
 593
 594 /*
 595  * Allocate an inode on disk and return a copy of its in-core version.
 596  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 597  * appropriately within the inode.  The uid and gid for the inode are
 598  * set according to the contents of the given cred structure.
 599  *
 600  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 601  * has a free inode available, call xfs_iget() to obtain the in-core
 602  * version of the allocated inode.  Finally, fill in the inode and
 603  * log its initial contents.  In this case, ialloc_context would be
 604  * set to NULL.
 605  *
 606  * If xfs_dialloc() does not have an available inode, it will replenish
 607  * its supply by doing an allocation. Since we can only do one
 608  * allocation within a transaction without deadlocks, we must commit
 609  * the current transaction before returning the inode itself.
 610  * In this case, therefore, we will set ialloc_context and return.
 611  * The caller should then commit the current transaction, start a new
 612  * transaction, and call xfs_ialloc() again to actually get the inode.
 613  *
 614  * To ensure that some other process does not grab the inode that
 615  * was allocated during the first call to xfs_ialloc(), this routine
 616  * also returns the [locked] bp pointing to the head of the freelist
 617  * as ialloc_context.  The caller should hold this buffer across
 618  * the commit and pass it back into this routine on the second call.
 619  *
 620  * If we are allocating quota inodes, we do not have a parent inode
 621  * to attach to or associate with (i.e. pip == NULL) because they
 622  * are not linked into the directory structure - they are attached
 623  * directly to the superblock - and so have no parent.
 624  */
 625 int
 626 xfs_ialloc(
 627         xfs_trans_t     *tp,
 628         xfs_inode_t     *pip,
 629         umode_t         mode,
 630         xfs_nlink_t     nlink,
 631         xfs_dev_t       rdev,
 632         prid_t          prid,
 633         int             okalloc,
 634         xfs_buf_t       **ialloc_context,
 635         xfs_inode_t     **ipp)
 636 {
 637         struct xfs_mount *mp = tp->t_mountp;
 638         xfs_ino_t       ino;
 639         xfs_inode_t     *ip;
 640         uint            flags;
 641         int             error;
 642         timespec_t      tv;
 643         int             filestreams = 0;
 644
 645         /*
 646          * Call the space management code to pick
 647          * the on-disk inode to be allocated.
 648          */
 649         error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
 650                             ialloc_context, &ino);
 651         if (error)
 652                 return error;
 653         if (*ialloc_context || ino == NULLFSINO) {
 654                 *ipp = NULL;
 655                 return 0;
 656         }
 657         ASSERT(*ialloc_context == NULL);
 658
 659         /*
 660          * Get the in-core inode with the lock held exclusively.
 661          * This is because we're setting fields here we need
 662          * to prevent others from looking at until we're done.
 663          */
 664         error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
 665                          XFS_ILOCK_EXCL, &ip);
 666         if (error)
 667                 return error;
 668         ASSERT(ip != NULL);
 669
 670         ip->i_d.di_mode = mode;
 671         ip->i_d.di_onlink = 0;
 672         ip->i_d.di_nlink = nlink;
 673         ASSERT(ip->i_d.di_nlink == nlink);
 674         ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
 675         ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
 676         xfs_set_projid(ip, prid);
 677         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 678
 679         /*
 680          * If the superblock version is up to where we support new format
 681          * inodes and this is currently an old format inode, then change
 682          * the inode version number now.  This way we only do the conversion
 683          * here rather than here and in the flush/logging code.
 684          */
 685         if (xfs_sb_version_hasnlink(&mp->m_sb) &&
 686             ip->i_d.di_version == 1) {
 687                 ip->i_d.di_version = 2;
 688                 /*
 689                  * We've already zeroed the old link count, the projid field,
 690                  * and the pad field.
 691                  */
 692         }
 693
 694         /*
 695          * Project ids won't be stored on disk if we are using a version 1 inode.
 696          */
 697         if ((prid != 0) && (ip->i_d.di_version == 1))
 698                 xfs_bump_ino_vers2(tp, ip);
 699
 700         if (pip && XFS_INHERIT_GID(pip)) {
 701                 ip->i_d.di_gid = pip->i_d.di_gid;
 702                 if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
 703                         ip->i_d.di_mode |= S_ISGID;
 704                 }
 705         }
 706
 707         /*
 708          * If the group ID of the new file does not match the effective group
 709          * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
 710          * (and only if the irix_sgid_inherit compatibility variable is set).
 711          */
 712         if ((irix_sgid_inherit) &&
 713             (ip->i_d.di_mode & S_ISGID) &&
 714             (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
 715                 ip->i_d.di_mode &= ~S_ISGID;
 716         }
 717
 718         ip->i_d.di_size = 0;
 719         ip->i_d.di_nextents = 0;
 720         ASSERT(ip->i_d.di_nblocks == 0);
 721
 722         nanotime(&tv);
 723         ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
 724         ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
 725         ip->i_d.di_atime = ip->i_d.di_mtime;
 726         ip->i_d.di_ctime = ip->i_d.di_mtime;
 727
 728         /*
 729          * di_gen will have been taken care of in xfs_iread.
 730          */
 731         ip->i_d.di_extsize = 0;
 732         ip->i_d.di_dmevmask = 0;
 733         ip->i_d.di_dmstate = 0;
 734         ip->i_d.di_flags = 0;
 735
 736         if (ip->i_d.di_version == 3) {
 737                 ASSERT(ip->i_d.di_ino == ino);
 738                 ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
 739                 ip->i_d.di_crc = 0;
 740                 ip->i_d.di_changecount = 1;
 741                 ip->i_d.di_lsn = 0;
 742                 ip->i_d.di_flags2 = 0;
 743                 memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
 744                 ip->i_d.di_crtime = ip->i_d.di_mtime;
 745         }
 746
 747
 748         flags = XFS_ILOG_CORE;
 749         switch (mode & S_IFMT) {
 750         case S_IFIFO:
 751         case S_IFCHR:
 752         case S_IFBLK:
 753         case S_IFSOCK:
 754                 ip->i_d.di_format = XFS_DINODE_FMT_DEV;
 755                 ip->i_df.if_u2.if_rdev = rdev;
 756                 ip->i_df.if_flags = 0;
 757                 flags |= XFS_ILOG_DEV;
 758                 break;
 759         case S_IFREG:
 760                 /*
 761                  * we can't set up filestreams until after the VFS inode
 762                  * is set up properly.
 763                  */
 764                 if (pip && xfs_inode_is_filestream(pip))
 765                         filestreams = 1;
 766                 /* fall through */
 767         case S_IFDIR:
 768                 if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
 769                         uint    di_flags = 0;
 770
 771                         if (S_ISDIR(mode)) {
 772                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
 773                                         di_flags |= XFS_DIFLAG_RTINHERIT;
 774                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
 775                                         di_flags |= XFS_DIFLAG_EXTSZINHERIT;
 776                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
 777                                 }
 778                         } else if (S_ISREG(mode)) {
 779                                 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
 780                                         di_flags |= XFS_DIFLAG_REALTIME;
 781                                 if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
 782                                         di_flags |= XFS_DIFLAG_EXTSIZE;
 783                                         ip->i_d.di_extsize = pip->i_d.di_extsize;
 784                                 }
 785                         }
 786                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
 787                             xfs_inherit_noatime)
 788                                 di_flags |= XFS_DIFLAG_NOATIME;
 789                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
 790                             xfs_inherit_nodump)
 791                                 di_flags |= XFS_DIFLAG_NODUMP;
 792                         if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
 793                             xfs_inherit_sync)
 794                                 di_flags |= XFS_DIFLAG_SYNC;
 795                         if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
 796                             xfs_inherit_nosymlinks)
 797                                 di_flags |= XFS_DIFLAG_NOSYMLINKS;
 798                         if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
 799                                 di_flags |= XFS_DIFLAG_PROJINHERIT;
 800                         if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
 801                             xfs_inherit_nodefrag)
 802                                 di_flags |= XFS_DIFLAG_NODEFRAG;
 803                         if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
 804                                 di_flags |= XFS_DIFLAG_FILESTREAM;
 805                         ip->i_d.di_flags |= di_flags;
 806                 }
 807                 /* FALLTHROUGH */
 808         case S_IFLNK:
 809                 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
 810                 ip->i_df.if_flags = XFS_IFEXTENTS;
 811                 ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
 812                 ip->i_df.if_u1.if_extents = NULL;
 813                 break;
 814         default:
 815                 ASSERT(0);
 816         }
 817         /*
 818          * Attribute fork settings for new inode.
 819          */
 820         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
 821         ip->i_d.di_anextents = 0;
 822
 823         /*
 824          * Log the new values stuffed into the inode.
 825          */
 826         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 827         xfs_trans_log_inode(tp, ip, flags);
 828
 829         /* now that we have an i_mode we can setup inode ops and unlock */
 830         xfs_setup_inode(ip);
 831
 832         /* now we have set up the vfs inode we can associate the filestream */
 833         if (filestreams) {
 834                 error = xfs_filestream_associate(pip, ip);
 835                 if (error < 0)
 836                         return -error;
 837                 if (!error)
 838                         xfs_iflags_set(ip, XFS_IFILESTREAM);
 839         }
 840
 841         *ipp = ip;
 842         return 0;
 843 }
 844
 845 /*
 846  * Allocates a new inode from disk and returns a pointer to the
 847  * incore copy. This routine will internally commit the current
 848  * transaction and allocate a new one if the Space Manager needed
 849  * to do an allocation to replenish the inode free-list.
 850  *
 851  * This routine is designed to be called from xfs_create and
 852  * xfs_create_dir.
 853  *
      * On success 0 is returned, *ipp points at the new inode and *tpp at
      * the transaction the caller must continue with (the original one, or
      * its duplicate if a commit was needed).  On failure *ipp is set to
      * NULL and the error from xfs_ialloc(), xfs_trans_commit() or
      * xfs_trans_reserve() is returned; XFS_ERROR(ENOSPC) is returned when
      * the first xfs_ialloc() call yields neither an inode nor a buffer.
      *
      * If "committed" is non-NULL it is set to 1 once the commit of the
      * original transaction has been attempted and to 0 when no commit was
      * needed; it is left untouched when the first xfs_ialloc() call fails.
      *
      * *tpp is only rewritten after the commit has been issued, so a failure
      * of the first xfs_ialloc() call leaves it unchanged.
 854  */
 855 int
 856 xfs_dir_ialloc(
 857         xfs_trans_t     **tpp,          /* input: current transaction;
 858                                            output: may be a new transaction. */
 859         xfs_inode_t     *dp,            /* directory within which to allocate
 860                                            the inode. */
 861         umode_t         mode,
 862         xfs_nlink_t     nlink,
 863         xfs_dev_t       rdev,
 864         prid_t          prid,           /* project id */
 865         int             okalloc,        /* ok to allocate new space */
 866         xfs_inode_t     **ipp,          /* pointer to inode; it will be
 867                                            locked. */
 868         int             *committed)
 869
 870 {
 871         xfs_trans_t     *tp;
 872         xfs_trans_t     *ntp;
 873         xfs_inode_t     *ip;
 874         xfs_buf_t       *ialloc_context = NULL;
 875         int             code;
 876         void            *dqinfo;
 877         uint            tflags;
 878
 879         tp = *tpp;
 880         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 881
 882         /*
 883          * xfs_ialloc will return a pointer to an incore inode if
 884          * the Space Manager has an available inode on the free
 885          * list. Otherwise, it will do an allocation and replenish
 886          * the freelist.  Since we can only do one allocation per
 887          * transaction without deadlocks, we will need to commit the
 888          * current transaction and start a new one.  We will then
 889          * need to call xfs_ialloc again to get the inode.
 890          *
 891          * If xfs_ialloc did an allocation to replenish the freelist,
 892          * it returns the bp containing the head of the freelist as
 893          * ialloc_context. We will hold a lock on it across the
 894          * transaction commit so that no other process can steal
 895          * the inode(s) that we've just allocated.
 896          */
 897         code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 898                           &ialloc_context, &ip);
 899
 900         /*
 901          * Return an error if we were unable to allocate a new inode.
 902          * This should only happen if we run out of space on disk or
 903          * encounter a disk error.
 904          */
 905         if (code) {
 906                 *ipp = NULL;
 907                 return code;
 908         }
             /* No error, but neither an inode nor a buffer to retry with. */
 909         if (!ialloc_context && !ip) {
 910                 *ipp = NULL;
 911                 return XFS_ERROR(ENOSPC);
 912         }
 913
 914         /*
 915          * If the AGI buffer is non-NULL, then we were unable to get an
 916          * inode in one operation.  We need to commit the current
 917          * transaction and call xfs_ialloc() again.  It is guaranteed
 918          * to succeed the second time.
 919          */
 920         if (ialloc_context) {
 921                 struct xfs_trans_res tres;
 922
 923                 /*
 924                  * Normally, xfs_trans_commit releases all the locks.
 925                  * We call bhold to hang on to the ialloc_context across
 926                  * the commit.  Holding this buffer prevents any other
 927                  * processes from doing any allocations in this
 928                  * allocation group.
 929                  */
 930                 xfs_trans_bhold(tp, ialloc_context);
 931                 /*
 932                  * Save the log reservation so we can use
 933                  * them in the next transaction.
 934                  */
 935                 tres.tr_logres = xfs_trans_get_log_res(tp);
 936                 tres.tr_logcount = xfs_trans_get_log_count(tp);
 937
 938                 /*
 939                  * We want the quota changes to be associated with the next
 940                  * transaction, NOT this one. So, detach the dqinfo from this
 941                  * and attach it to the next transaction.
 942                  */
 943                 dqinfo = NULL;
 944                 tflags = 0;
 945                 if (tp->t_dqinfo) {
 946                         dqinfo = (void *)tp->t_dqinfo;
 947                         tp->t_dqinfo = NULL;
 948                         tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
 949                         tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
 950                 }
 951
                     /*
                      * Duplicate first, then commit the original.  tp is
                      * switched to the duplicate before the commit result is
                      * examined, so every path below (including the error
                      * returns) operates on and hands back the duplicate.
                      */
 952                 ntp = xfs_trans_dup(tp);
 953                 code = xfs_trans_commit(tp, 0);
 954                 tp = ntp;
 955                 if (committed != NULL) {
 956                         *committed = 1;
 957                 }
 958                 /*
 959                  * If we get an error during the commit processing,
 960                  * release the buffer that is still held and return
 961                  * to the caller.
 962                  */
 963                 if (code) {
 964                         xfs_buf_relse(ialloc_context);
                             /*
                              * tp is the duplicate here: hang the detached
                              * dqinfo on it so xfs_trans_free_dqinfo() can
                              * dispose of it.
                              */
 965                         if (dqinfo) {
 966                                 tp->t_dqinfo = dqinfo;
 967                                 xfs_trans_free_dqinfo(tp);
 968                         }
 969                         *tpp = ntp;
 970                         *ipp = NULL;
 971                         return code;
 972                 }
 973
 974                 /*
 975                  * transaction commit worked ok so we can drop the extra ticket
 976                  * reference that we gained in xfs_trans_dup()
 977                  */
 978                 xfs_log_ticket_put(tp->t_ticket);
 979                 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
 980                 code = xfs_trans_reserve(tp, &tres, 0, 0);
 981
 982                 /*
 983                  * Re-attach the quota info that we detached from prev trx.
                      * This happens before the reservation result is checked,
                      * so the dqinfo also travels with the transaction that is
                      * handed back when the reservation failed.
 984                  */
 985                 if (dqinfo) {
 986                         tp->t_dqinfo = dqinfo;
 987                         tp->t_flags |= tflags;
 988                 }
 989
 990                 if (code) {
 991                         xfs_buf_relse(ialloc_context);
 992                         *tpp = ntp;
 993                         *ipp = NULL;
 994                         return code;
 995                 }
                     /* Join the buffer held across the commit to the new transaction. */
 996                 xfs_trans_bjoin(tp, ialloc_context);
 997
 998                 /*
 999                  * Call ialloc again. Since we've locked out all
1000                  * other allocations in this allocation group,
1001                  * this call should always succeed.
1002                  */
1003                 code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
1004                                   okalloc, &ialloc_context, &ip);
1005
1006                 /*
1007                  * If we get an error at this point, return to the caller
1008                  * so that the current transaction can be aborted.
1009                  */
1010                 if (code) {
1011                         *tpp = tp;
1012                         *ipp = NULL;
1013                         return code;
1014                 }
1015                 ASSERT(!ialloc_context && ip);
1016
1017         } else {
                     /* Inode obtained in one go: the original transaction is still live. */
1018                 if (committed != NULL)
1019                         *committed = 0;
1020         }
1021
1022         *ipp = ip;
1023         *tpp = tp;
1024
1025         return 0;
1026 }
1027
1028 /*
1029  * Decrement the link count on an inode & log the change.
1030  * If this causes the link count to go to zero, initiate the
1031  * logging activity required to truncate a file.
      *
      * Both the on-disk count (di_nlink) and the VFS inode's count
      * (via drop_nlink()) are decremented, xfs_trans_ichgtime() is called
      * with XFS_ICHGTIME_CHG, and the inode core is logged in tp.
      *
      * Returns 0, or the result of xfs_iunlink() when the last link
      * was dropped.
1032  */
1033 int                             /* error */
1034 xfs_droplink(
1035         xfs_trans_t *tp,
1036         xfs_inode_t *ip)
1037 {
1038         int     error;
1039
1040         xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1041
             /* The caller must not drop a link on an inode that has none left. */
1042         ASSERT (ip->i_d.di_nlink > 0);
1043         ip->i_d.di_nlink--;
1044         drop_nlink(VFS_I(ip));
1045         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1046
1047         error = 0;
1048         if (ip->i_d.di_nlink == 0) {
1049                 /*
1050                  * We're dropping the last link to this file.
1051                  * Move the on-disk inode to the AGI unlinked list.
1052                  * From xfs_inactive() we will pull the inode from
1053                  * the list and free it.
1054                  */
1055                 error = xfs_iunlink(tp, ip);
1056         }
1057         return error;
1058 }
1059
1060 /*
1061  * This gets called when the inode's version needs to be changed from 1 to 2.
1062  * Currently this happens when the nlink field overflows the old 16-bit value
1063  * or when chproj is called to change the project for the first time.
1064  * As a side effect the superblock version will also get rev'd
1065  * to contain the NLINK bit.
      *
      * The caller must hold the inode's ILOCK exclusively and the inode must
      * still be version 1 (both are asserted).  The inode itself is not
      * logged here; that is left to the caller.
1066  */
1067 void
1068 xfs_bump_ino_vers2(
1069         xfs_trans_t     *tp,
1070         xfs_inode_t     *ip)
1071 {
1072         xfs_mount_t     *mp;
1073
1074         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1075         ASSERT(ip->i_d.di_version == 1);
1076
             /* Switch to version 2: clear the old link count field and the padding. */
1077         ip->i_d.di_version = 2;
1078         ip->i_d.di_onlink = 0;
1079         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1080         mp = tp->t_mountp;
             /*
              * Test the superblock feature bit without the lock first, then
              * test it again under m_sb_lock before setting it.  The spinlock
              * is dropped before xfs_mod_sb() is called.
              */
1081         if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
1082                 spin_lock(&mp->m_sb_lock);
1083                 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
1084                         xfs_sb_version_addnlink(&mp->m_sb);
1085                         spin_unlock(&mp->m_sb_lock);
1086                         xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
1087                 } else {
1088                         spin_unlock(&mp->m_sb_lock);
1089                 }
1090         }
1091         /* Caller must log the inode */
1092 }
1093
1094 /*
1095  * Increment the link count on an inode & log the change.
      *
      * Both the on-disk count (di_nlink) and the VFS inode's count
      * (via inc_nlink()) are incremented and xfs_trans_ichgtime() is called
      * with XFS_ICHGTIME_CHG.  A version 1 inode whose count grows past
      * XFS_MAXLINK_1 is converted to version 2 before the core is logged.
      *
      * Always returns 0.
1096  */
1097 int
1098 xfs_bumplink(
1099         xfs_trans_t *tp,
1100         xfs_inode_t *ip)
1101 {
1102         xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1103
             /* Only inodes that already have at least one link may gain another. */
1104         ASSERT(ip->i_d.di_nlink > 0);
1105         ip->i_d.di_nlink++;
1106         inc_nlink(VFS_I(ip));
1107         if ((ip->i_d.di_version == 1) &&
1108             (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
1109                 /*
1110                  * The inode has increased its number of links beyond
1111                  * what can fit in an old format inode.  It now needs
1112                  * to be converted to a version 2 inode with a 32 bit
1113                  * link count.  If this is the first inode in the file
1114                  * system to do this, then we need to bump the superblock
1115                  * version number as well.
1116                  */
1117                 xfs_bump_ino_vers2(tp, ip);
1118         }
1119
             /* Logs the core after any version bump, so the conversion is covered too. */
1120         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1121         return 0;
1122 }
1123
1124 int
1125 xfs_create(
1126         xfs_inode_t             *dp,
1127         struct xfs_name         *name,
1128         umode_t                 mode,
1129         xfs_dev_t               rdev,
1130         xfs_inode_t             **ipp)
1131 {
             /*
              * Create a new inode of type "mode" and enter it as "name" in the
              * directory dp.  S_ISDIR(mode) selects the mkdir reservation,
              * a link count of 2 and the directory initialisation below;
              * rdev is forced to 0 in that case.
              *
              * Returns 0 with *ipp set to the new inode.  On failure the error
              * of the failing step is returned and *ipp is not written;
              * XFS_ERROR(EIO) is returned if the mount has been forcibly shut
              * down.  Error values are compared against ENOSPC directly in
              * this function.
              *
              * The dquots obtained from xfs_qm_vop_dqalloc() are released on
              * every exit after that call, on success as well as on failure.
              */
1132         int                     is_dir = S_ISDIR(mode);
1133         struct xfs_mount        *mp = dp->i_mount;
1134         struct xfs_inode        *ip = NULL;
1135         struct xfs_trans        *tp = NULL;
1136         int                     error;
1137         xfs_bmap_free_t         free_list;
1138         xfs_fsblock_t           first_block;
1139         bool                    unlock_dp_on_error = false;
1140         uint                    cancel_flags;
1141         int                     committed;
1142         prid_t                  prid;
1143         struct xfs_dquot        *udqp = NULL;
1144         struct xfs_dquot        *gdqp = NULL;
1145         struct xfs_dquot        *pdqp = NULL;
1146         struct xfs_trans_res    tres;
1147         uint                    resblks;
1148
1149         trace_xfs_create(dp, name);
1150
1151         if (XFS_FORCED_SHUTDOWN(mp))
1152                 return XFS_ERROR(EIO);
1153
             /* The new inode takes the parent's project id only if the parent asks for it. */
1154         if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1155                 prid = xfs_get_projid(dp);
1156         else
1157                 prid = XFS_PROJID_DEFAULT;
1158
1159         /*
1160          * Make sure that we have allocated dquot(s) on disk.
1161          */
1162         error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
1163                                         xfs_kgid_to_gid(current_fsgid()), prid,
1164                                         XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
1165                                         &udqp, &gdqp, &pdqp);
1166         if (error)
1167                 return error;
1168
             /* Pick the block and log reservation matching the object type. */
1169         if (is_dir) {
1170                 rdev = 0;
1171                 resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
1172                 tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
1173                 tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
1174                 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
1175         } else {
1176                 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1177                 tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
1178                 tres.tr_logcount = XFS_CREATE_LOG_COUNT;
1179                 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1180         }
1181
1182         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1183
1184         /*
1185          * Initially assume that the file does not exist and
1186          * reserve the resources for that case.  If that is not
1187          * the case we'll drop the one we have and get a more
1188          * appropriate transaction later.
1189          */
1190         tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
1191         error = xfs_trans_reserve(tp, &tres, resblks, 0);
1192         if (error == ENOSPC) {
1193                 /* flush outstanding delalloc blocks and retry */
1194                 xfs_flush_inodes(mp);
1195                 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1196         }
1197         if (error == ENOSPC) {
1198                 /* No space at all so try a "no-allocation" reservation */
1199                 resblks = 0;
1200                 error = xfs_trans_reserve(tp, &tres, 0, 0);
1201         }
1202         if (error) {
                     /*
                      * Cancel without XFS_TRANS_RELEASE_LOG_RES; presumably
                      * because no log reservation was obtained -- confirm
                      * against xfs_trans_cancel().
                      */
1203                 cancel_flags = 0;
1204                 goto out_trans_cancel;
1205         }
1206
             /*
              * dp is locked here but only joined to the transaction after the
              * inode allocation, so until then the error path has to unlock it
              * itself.
              */
1207         xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1208         unlock_dp_on_error = true;
1209
1210         xfs_bmap_init(&free_list, &first_block);
1211
1212         /*
1213          * Reserve disk quota and the inode.
1214          */
1215         error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
1216                                                 pdqp, resblks, 1, 0);
1217         if (error)
1218                 goto out_trans_cancel;
1219
1220         error = xfs_dir_canenter(tp, dp, name, resblks);
1221         if (error)
1222                 goto out_trans_cancel;
1223
1224         /*
1225          * A newly created regular or special file just has one directory
1226          * entry pointing to them, but a directory also has the "." entry
1227          * pointing to itself.
1228          */
             /* "resblks > 0" is the okalloc argument: allocate only with a block reservation. */
1229         error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
1230                                prid, resblks > 0, &ip, &committed);
1231         if (error) {
                     /* ENOSPC is cancelled without XFS_TRANS_ABORT; anything else aborts. */
1232                 if (error == ENOSPC)
1233                         goto out_trans_cancel;
1234                 goto out_trans_abort;
1235         }
1236
1237         /*
1238          * Now we join the directory inode to the transaction.  We do not do it
1239          * earlier because xfs_dir_ialloc might commit the previous transaction
1240          * (and release all the locks).  An error from here on will result in
1241          * the transaction cancel unlocking dp so don't do it explicitly in the
1242          * error path.
1243          */
1244         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1245         unlock_dp_on_error = false;
1246
             /*
              * The last argument is the block reservation less
              * XFS_IALLOC_SPACE_RES(mp), or 0 when running with no block
              * reservation at all.
              */
1247         error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1248                                         &first_block, &free_list, resblks ?
1249                                         resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1250         if (error) {
1251                 ASSERT(error != ENOSPC);
1252                 goto out_trans_abort;
1253         }
1254         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1255         xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
1256
1257         if (is_dir) {
                     /* Initialise the new directory, then bump the parent's link count. */
1258                 error = xfs_dir_init(tp, ip, dp);
1259                 if (error)
1260                         goto out_bmap_cancel;
1261
1262                 error = xfs_bumplink(tp, dp);
1263                 if (error)
1264                         goto out_bmap_cancel;
1265         }
1266
1267         /*
1268          * If this is a synchronous mount, make sure that the
1269          * create transaction goes to disk before returning to
1270          * the user.
1271          */
1272         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
1273                 xfs_trans_set_sync(tp);
1274
1275         /*
1276          * Attach the dquot(s) to the inodes and modify them incore.
1277          * These ids of the inode couldn't have changed since the new
1278          * inode has been locked ever since it was created.
1279          */
1280         xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
1281
1282         error = xfs_bmap_finish(&tp, &free_list, &committed);
1283         if (error)
1284                 goto out_bmap_cancel;
1285
             /* A failed commit skips the cancel and goes straight to releasing ip. */
1286         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1287         if (error)
1288                 goto out_release_inode;
1289
1290         xfs_qm_dqrele(udqp);
1291         xfs_qm_dqrele(gdqp);
1292         xfs_qm_dqrele(pdqp);
1293
1294         *ipp = ip;
1295         return 0;
1296
             /*
              * Error unwinding, entered at the label matching how far we got:
              * out_bmap_cancel drops the pending free list, out_trans_abort
              * adds XFS_TRANS_ABORT to the cancel flags, out_trans_cancel
              * cancels the transaction and out_release_inode drops the new
              * inode (if one was allocated) and the dquots.
              */
1297  out_bmap_cancel:
1298         xfs_bmap_cancel(&free_list);
1299  out_trans_abort:
1300         cancel_flags |= XFS_TRANS_ABORT;
1301  out_trans_cancel:
1302         xfs_trans_cancel(tp, cancel_flags);
1303  out_release_inode:
1304         /*
1305          * Wait until after the current transaction is aborted to
1306          * release the inode.  This prevents recursive transactions
1307          * and deadlocks from xfs_inactive.
1308          */
1309         if (ip)
1310                 IRELE(ip);
1311
1312         xfs_qm_dqrele(udqp);
1313         xfs_qm_dqrele(gdqp);
1314         xfs_qm_dqrele(pdqp);
1315
             /* Still true only if we failed between locking dp and joining it to tp. */
1316         if (unlock_dp_on_error)
1317                 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1318         return error;
1319 }
1320
1321 int
1322 xfs_link(
1323         xfs_inode_t             *tdp,
1324         xfs_inode_t             *sip,
1325         struct xfs_name         *target_name)
1326 {
             /*
              * Add a directory entry "target_name" in directory tdp that refers
              * to the existing inode sip, and bump sip's link count.  sip must
              * not be a directory (asserted).
              *
              * Returns the result of the final xfs_trans_commit() on the normal
              * path.  Otherwise returns the error of the failing step:
              * XFS_ERROR(EIO) if the mount has been forcibly shut down, and
              * XFS_ERROR(EXDEV) if tdp has XFS_DIFLAG_PROJINHERIT set and its
              * project id differs from sip's.
              */
1327         xfs_mount_t             *mp = tdp->i_mount;
1328         xfs_trans_t             *tp;
1329         int                     error;
1330         xfs_bmap_free_t         free_list;
1331         xfs_fsblock_t           first_block;
1332         int                     cancel_flags;
1333         int                     committed;
1334         int                     resblks;
1335
1336         trace_xfs_link(tdp, target_name);
1337
1338         ASSERT(!S_ISDIR(sip->i_d.di_mode));
1339
1340         if (XFS_FORCED_SHUTDOWN(mp))
1341                 return XFS_ERROR(EIO);
1342
             /* No transaction exists yet, so these failures return without a cancel. */
1343         error = xfs_qm_dqattach(sip, 0);
1344         if (error)
1345                 goto std_return;
1346
1347         error = xfs_qm_dqattach(tdp, 0);
1348         if (error)
1349                 goto std_return;
1350
1351         tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
1352         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1353         resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1354         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
             /* On ENOSPC retry once with no block reservation at all. */
1355         if (error == ENOSPC) {
1356                 resblks = 0;
1357                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
1358         }
1359         if (error) {
                     /*
                      * Cancel without XFS_TRANS_RELEASE_LOG_RES; presumably
                      * because no log reservation was obtained -- confirm
                      * against xfs_trans_cancel().
                      */
1360                 cancel_flags = 0;
1361                 goto error_return;
1362         }
1363
1364         xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1365
             /*
              * Both inodes are joined with XFS_ILOCK_EXCL, and none of the exit
              * paths below unlocks them explicitly.
              */
1366         xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1367         xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1368
1369         /*
1370          * If we are using project inheritance, we only allow hard link
1371          * creation in our tree when the project IDs are the same; else
1372          * the tree quota mechanism could be circumvented.
1373          */
1374         if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1375                      (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1376                 error = XFS_ERROR(EXDEV);
1377                 goto error_return;
1378         }
1379
1380         error = xfs_dir_canenter(tp, tdp, target_name, resblks);
1381         if (error)
1382                 goto error_return;
1383
1384         xfs_bmap_init(&free_list, &first_block);
1385
             /* From here on failures cancel the transaction with XFS_TRANS_ABORT. */
1386         error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
1387                                         &first_block, &free_list, resblks);
1388         if (error)
1389                 goto abort_return;
1390         xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
1391         xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
1392
1393         error = xfs_bumplink(tp, sip);
1394         if (error)
1395                 goto abort_return;
1396
1397         /*
1398          * If this is a synchronous mount, make sure that the
1399          * link transaction goes to disk before returning to
1400          * the user.
1401          */
1402         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
1403                 xfs_trans_set_sync(tp);
1404         }
1405
1406         error = xfs_bmap_finish (&tp, &free_list, &committed);
1407         if (error) {
                     /* Drop the pending free list before aborting the transaction. */
1408                 xfs_bmap_cancel(&free_list);
1409                 goto abort_return;
1410         }
1411
1412         return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1413
             /*
              * abort_return adds XFS_TRANS_ABORT to the cancel flags,
              * error_return cancels the transaction and std_return is used
              * when no transaction has been allocated yet.
              */
1414  abort_return:
1415         cancel_flags |= XFS_TRANS_ABORT;
1416  error_return:
1417         xfs_trans_cancel(tp, cancel_flags);
1418  std_return:
1419         return error;
1420 }
1421
1422 /*
1423  * Free up the underlying blocks past new_size.  The new size must be smaller
1424  * than the current size.  This routine can be used both for the attribute and
1425  * data fork, and does not modify the inode size, which is left to the caller.
1426  *
1427  * The transaction passed to this routine must have made a permanent log
1428  * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
1429  * given transaction and start new ones, so make sure everything involved in
1430  * the transaction is tidy before calling here.  Some transaction will be
1431  * returned to the caller to be committed.  The incoming transaction must
1432  * already include the inode, and both inode locks must be held exclusively.
1433  * The inode must also be "held" within the transaction.  On return the inode
1434  * will be "held" within the returned transaction.  This routine does NOT
1435  * require any disk space to be reserved for it within the transaction.
1436  *
1437  * If we get an error, we must return with the inode locked and linked into the
1438  * current transaction. This keeps things simple for the higher level code,
1439  * because it always knows that the inode is locked and held in the transaction
1440  * that returns to it whether errors occur or not.  We don't mark the inode
1441  * dirty on error so that transactions can be easily aborted if possible.
      *
      * tpp is in/out: *tpp is the transaction to start with and is replaced by
      * the transaction current at exit, on success and on error alike.
      * whichfork selects the fork through xfs_bmapi_aflag().  new_size is the
      * byte size past which blocks are removed.
      *
      * Returns 0 on success, or the error from xfs_bunmapi(),
      * xfs_bmap_finish(), xfs_trans_commit() or xfs_trans_reserve().  When
      * there is nothing to unmap, 0 is returned straight away and *tpp is not
      * rewritten.
1442  */
1443 int
1444 xfs_itruncate_extents(
1445         struct xfs_trans        **tpp,
1446         struct xfs_inode        *ip,
1447         int                     whichfork,
1448         xfs_fsize_t             new_size)
1449 {
1450         struct xfs_mount        *mp = ip->i_mount;
1451         struct xfs_trans        *tp = *tpp;
1452         struct xfs_trans        *ntp;
1453         xfs_bmap_free_t         free_list;
1454         xfs_fsblock_t           first_block;
1455         xfs_fileoff_t           first_unmap_block;
1456         xfs_fileoff_t           last_block;
1457         xfs_filblks_t           unmap_len;
1458         int                     committed;
1459         int                     error = 0;
1460         int                     done = 0;
1461
1462         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1463         ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
1464                xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1465         ASSERT(new_size <= XFS_ISIZE(ip));
1466         ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1467         ASSERT(ip->i_itemp != NULL);
1468         ASSERT(ip->i_itemp->ili_lock_flags == 0);
1469         ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1470
1471         trace_xfs_itruncate_extents_start(ip, new_size);
1472
1473         /*
1474          * Since it is possible for space to become allocated beyond
1475          * the end of the file (in a crash where the space is allocated
1476          * but the inode size is not yet updated), simply remove any
1477          * blocks which show up between the new EOF and the maximum
1478          * possible file size.  If the first block to be removed is
1479          * beyond the maximum file size (ie it is the same as last_block),
1480          * then there is nothing to do.
1481          */
1482         first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1483         last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
1484         if (first_unmap_block == last_block)
1485                 return 0;
1486
1487         ASSERT(first_unmap_block < last_block);
             /* Length of the range first_unmap_block..last_block, both ends included. */
1488         unmap_len = last_block - first_unmap_block + 1;
             /*
              * Each pass asks xfs_bunmapi() to unmap at most
              * XFS_ITRUNC_MAX_EXTENTS extents, then rolls the transaction;
              * the loop ends once xfs_bunmapi() sets "done".
              */
1489         while (!done) {
1490                 xfs_bmap_init(&free_list, &first_block);
1491                 error = xfs_bunmapi(tp, ip,
1492                                     first_unmap_block, unmap_len,
1493                                     xfs_bmapi_aflag(whichfork),
1494                                     XFS_ITRUNC_MAX_EXTENTS,
1495                                     &first_block, &free_list,
1496                                     &done);
1497                 if (error)
1498                         goto out_bmap_cancel;
1499
1500                 /*
1501                  * Duplicate the transaction that has the permanent
1502                  * reservation and commit the old transaction.
1503                  */
1504                 error = xfs_bmap_finish(&tp, &free_list, &committed);
                     /*
                      * Re-join the inode before looking at the error so that it
                      * is part of the current transaction on the error path too.
                      */
1505                 if (committed)
1506                         xfs_trans_ijoin(tp, ip, 0);
1507                 if (error)
1508                         goto out_bmap_cancel;
1509
1510                 if (committed) {
1511                         /*
1512                          * Mark the inode dirty so it will be logged and
1513                          * moved forward in the log as part of every commit.
1514                          */
1515                         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1516                 }
1517
                     /*
                      * Roll the transaction: duplicate it, commit the old one
                      * and carry on with the duplicate.
                      */
1518                 ntp = xfs_trans_dup(tp);
1519                 error = xfs_trans_commit(tp, 0);
1520                 tp = ntp;
1521
                     /* Joined before the commit error is checked, as above. */
1522                 xfs_trans_ijoin(tp, ip, 0);
1523
1524                 if (error)
1525                         goto out;
1526
1527                 /*
1528                  * Transaction commit worked ok so we can drop the extra ticket
1529                  * reference that we gained in xfs_trans_dup()
1530                  */
1531                 xfs_log_ticket_put(tp->t_ticket);
1532                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
1533                 if (error)
1534                         goto out;
1535         }
1536
1537         /*
1538          * Always re-log the inode so that our permanent transaction can keep
1539          * on rolling it forward in the log.
1540          */
1541         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1542
1543         trace_xfs_itruncate_extents_end(ip, new_size);
1544
             /* Common exit: hand the current transaction back, error or not. */
1545 out:
1546         *tpp = tp;
1547         return error;
1548 out_bmap_cancel:
1549         /*
1550          * If the bunmapi call encounters an error, return to the caller where
1551          * the transaction can be properly aborted.  We just need to make sure
1552          * we're not holding any resources that we were not when we came in.
1553          */
1554         xfs_bmap_cancel(&free_list);
1555         goto out;
1556 }
1557
1558 int
1559 xfs_release(
1560         xfs_inode_t     *ip)
1561 {
1562         xfs_mount_t     *mp = ip->i_mount;
1563         int             error;
1564
1565         if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
1566                 return 0;
1567
1568         /* If this is a read-only mount, don't do this (would generate I/O) */
1569         if (mp->m_flags & XFS_MOUNT_RDONLY)
1570                 return 0;
1571
1572         if (!XFS_FORCED_SHUTDOWN(mp)) {
1573                 int truncated;
1574
1575                 /*
1576                  * If we are using filestreams, and we have an unlinked
1577                  * file that we are processing the last close on, then nothing
1578                  * will be able to reopen and write to this file. Purge this
1579                  * inode from the filestreams cache so that it doesn't delay
1580                  * teardown of the inode.
1581                  */
1582                 if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
1583                         xfs_filestream_deassociate(ip);
1584
1585                 /*
1586                  * If we previously truncated this file and removed old data
1587                  * in the process, we want to initiate "early" writeout on
1588                  * the last close.  This is an attempt to combat the notorious
1589                  * NULL files problem which is particularly noticeable from a
1590                  * truncate down, buffered (re-)write (delalloc), followed by
1591                  * a crash.  What we are effectively doing here is
1592                  * significantly reducing the time window where we'd otherwise
1593                  * be exposed to that problem.
1594                  */
1595                 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1596                 if (truncated) {
1597                         xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1598                         if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
1599                                 error = -filemap_flush(VFS_I(ip)->i_mapping);
1600                                 if (error)
1601                                         return error;
1602                         }
1603                 }
1604         }
1605
1606         if (ip->i_d.di_nlink == 0)
1607                 return 0;
1608
1609         if (xfs_can_free_eofblocks(ip, false)) {
1610
1611                 /*
1612                  * If we can't get the iolock just skip truncating the blocks
1613                  * past EOF because we could deadlock with the mmap_sem
1614                  * otherwise.  We'll get another chance to drop them once the
1615                  * last reference to the inode is dropped, so we'll never leak
1616                  * blocks permanently.
1617                  *
1618                  * Further, check if the inode is being opened, written and
1619                  * closed frequently and we have delayed allocation blocks
1620                  * outstanding (e.g. streaming writes from the NFS server),
1621                  * truncating the blocks past EOF will cause fragmentation to
1622                  * occur.
1623                  *
1624                  * In this case don't do the truncation, either, but we have to
1625                  * be careful how we detect this case. Blocks beyond EOF show
1626                  * up as i_delayed_blks even when the inode is clean, so we
1627                  * need to truncate them away first before checking for a dirty
1628                  * release. Hence on the first dirty close we will still remove
1629                  * the speculative allocation, but after that we will leave it
1630                  * in place.
1631                  */
1632                 if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1633                         return 0;
1634
1635                 error = xfs_free_eofblocks(mp, ip, true);
1636                 if (error && error != EAGAIN)
1637                         return error;
1638
1639                 /* delalloc blocks after truncation means it really is dirty */
1640                 if (ip->i_delayed_blks)
1641                         xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1642         }
1643         return 0;
1644 }
1645
1646 /*
1647  * xfs_inactive_truncate
1648  *
1649  * Called to perform a truncate when an inode becomes unlinked.
1650  */
1651 STATIC int
1652 xfs_inactive_truncate(
1653         struct xfs_inode *ip)
1654 {
1655         struct xfs_mount        *mp = ip->i_mount;
1656         struct xfs_trans        *tp;
1657         int                     error;
1658
1659         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1660         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
1661         if (error) {
1662                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1663                 xfs_trans_cancel(tp, 0);
1664                 return error;
1665         }
1666
1667         xfs_ilock(ip, XFS_ILOCK_EXCL);
1668         xfs_trans_ijoin(tp, ip, 0);
1669
1670         /*
1671          * Log the inode size first to prevent stale data exposure in the event
1672          * of a system crash before the truncate completes. See the related
1673          * comment in xfs_setattr_size() for details.
1674          */
1675         ip->i_d.di_size = 0;
1676         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1677
1678         error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1679         if (error)
1680                 goto error_trans_cancel;
1681
1682         ASSERT(ip->i_d.di_nextents == 0);
1683
1684         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1685         if (error)
1686                 goto error_unlock;
1687
1688         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1689         return 0;
1690
1691 error_trans_cancel:
1692         xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1693 error_unlock:
1694         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1695         return error;
1696 }
1697
1698 /*
1699  * xfs_inactive_ifree()
1700  *
1701  * Perform the inode free when an inode is unlinked.
1702  */
1703 STATIC int
1704 xfs_inactive_ifree(
1705         struct xfs_inode *ip)
1706 {
1707         xfs_bmap_free_t         free_list;
1708         xfs_fsblock_t           first_block;
1709         int                     committed;
1710         struct xfs_mount        *mp = ip->i_mount;
1711         struct xfs_trans        *tp;
1712         int                     error;
1713
1714         tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1715         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
1716         if (error) {
1717                 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1718                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
1719                 return error;
1720         }
1721
1722         xfs_ilock(ip, XFS_ILOCK_EXCL);
1723         xfs_trans_ijoin(tp, ip, 0);
1724
1725         xfs_bmap_init(&free_list, &first_block);
1726         error = xfs_ifree(tp, ip, &free_list);
1727         if (error) {
1728                 /*
1729                  * If we fail to free the inode, shut down.  The cancel
1730                  * might do that, we need to make sure.  Otherwise the
1731                  * inode might be lost for a long time or forever.
1732                  */
1733                 if (!XFS_FORCED_SHUTDOWN(mp)) {
1734                         xfs_notice(mp, "%s: xfs_ifree returned error %d",
1735                                 __func__, error);
1736                         xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1737                 }
1738                 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1739                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1740                 return error;
1741         }
1742
1743         /*
1744          * Credit the quota account(s). The inode is gone.
1745          */
1746         xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1747
1748         /*
1749          * Just ignore errors at this point.  There is nothing we can
1750          * do except to try to keep going. Make sure it's not a silent
1751          * error.
1752          */
1753         error = xfs_bmap_finish(&tp,  &free_list, &committed);
1754         if (error)
1755                 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1756                         __func__, error);
1757         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1758         if (error)
1759                 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1760                         __func__, error);
1761
1762         xfs_iunlock(ip, XFS_ILOCK_EXCL);
1763         return 0;
1764 }
1765
1766 /*
1767  * xfs_inactive
1768  *
1769  * This is called when the vnode reference count for the vnode
1770  * goes to zero.  If the file has been unlinked, then it must
1771  * now be truncated.  Also, we clear all of the read-ahead state
1772  * kept for the inode here since the file is now closed.
1773  */
1774 void
1775 xfs_inactive(
1776         xfs_inode_t     *ip)
1777 {
1778         struct xfs_mount        *mp;
1779         int                     error;
1780         int                     truncate = 0;
1781
1782         /*
1783          * If the inode is already free, then there can be nothing
1784          * to clean up here.
1785          */
1786         if (ip->i_d.di_mode == 0) {
1787                 ASSERT(ip->i_df.if_real_bytes == 0);
1788                 ASSERT(ip->i_df.if_broot_bytes == 0);
1789                 return;
1790         }
1791
1792         mp = ip->i_mount;
1793
1794         /* If this is a read-only mount, don't do this (would generate I/O) */
1795         if (mp->m_flags & XFS_MOUNT_RDONLY)
1796                 return;
1797
1798         if (ip->i_d.di_nlink != 0) {
1799                 /*
1800                  * force is true because we are evicting an inode from the
1801                  * cache. Post-eof blocks must be freed, lest we end up with
1802                  * broken free space accounting.
1803                  */
1804                 if (xfs_can_free_eofblocks(ip, true))
1805                         xfs_free_eofblocks(mp, ip, false);
1806
1807                 return;
1808         }
1809
1810         if (S_ISREG(ip->i_d.di_mode) &&
1811             (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1812              ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1813                 truncate = 1;
1814
1815         error = xfs_qm_dqattach(ip, 0);
1816         if (error)
1817                 return;
1818
1819         if (S_ISLNK(ip->i_d.di_mode))
1820                 error = xfs_inactive_symlink(ip);
1821         else if (truncate)
1822                 error = xfs_inactive_truncate(ip);
1823         if (error)
1824                 return;
1825
1826         /*
1827          * If there are attributes associated with the file then blow them away
1828          * now.  The code calls a routine that recursively deconstructs the
1829          * attribute fork.  We need to just commit the current transaction
1830          * because we can't use it for xfs_attr_inactive().
1831          */
1832         if (ip->i_d.di_anextents > 0) {
1833                 ASSERT(ip->i_d.di_forkoff != 0);
1834
1835                 error = xfs_attr_inactive(ip);
1836                 if (error)
1837                         return;
1838         }
1839
1840         if (ip->i_afp)
1841                 xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1842
1843         ASSERT(ip->i_d.di_anextents == 0);
1844
1845         /*
1846          * Free the inode.
1847          */
1848         error = xfs_inactive_ifree(ip);
1849         if (error)
1850                 return;
1851
1852         /*
1853          * Release the dquots held by inode, if any.
1854          */
1855         xfs_qm_dqdetach(ip);
1856 }
1857
1858 /*
1859  * This is called when the inode's link count goes to 0.
1860  * We place the on-disk inode on a list in the AGI.  It
1861  * will be pulled from this list when the inode is freed.
1862  */
1863 int
1864 xfs_iunlink(
1865         xfs_trans_t     *tp,
1866         xfs_inode_t     *ip)
1867 {
1868         xfs_mount_t     *mp;
1869         xfs_agi_t       *agi;
1870         xfs_dinode_t    *dip;
1871         xfs_buf_t       *agibp;
1872         xfs_buf_t       *ibp;
1873         xfs_agino_t     agino;
1874         short           bucket_index;
1875         int             offset;
1876         int             error;
1877
1878         ASSERT(ip->i_d.di_nlink == 0);
1879         ASSERT(ip->i_d.di_mode != 0);
1880
1881         mp = tp->t_mountp;
1882
1883         /*
1884          * Get the agi buffer first.  It ensures lock ordering
1885          * on the list.
1886          */
1887         error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1888         if (error)
1889                 return error;
1890         agi = XFS_BUF_TO_AGI(agibp);
1891
1892         /*
1893          * Get the index into the agi hash table for the
1894          * list this inode will go on.
1895          */
1896         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1897         ASSERT(agino != 0);
1898         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1899         ASSERT(agi->agi_unlinked[bucket_index]);
1900         ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1901
1902         if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1903                 /*
1904                  * There is already another inode in the bucket we need
1905                  * to add ourselves to.  Add us at the front of the list.
1906                  * Here we put the head pointer into our next pointer,
1907                  * and then we fall through to point the head at us.
1908                  */
1909                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1910                                        0, 0);
1911                 if (error)
1912                         return error;
1913
1914                 ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1915                 dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1916                 offset = ip->i_imap.im_boffset +
1917                         offsetof(xfs_dinode_t, di_next_unlinked);
1918
1919                 /* need to recalc the inode CRC if appropriate */
1920                 xfs_dinode_calc_crc(mp, dip);
1921
1922                 xfs_trans_inode_buf(tp, ibp);
1923                 xfs_trans_log_buf(tp, ibp, offset,
1924                                   (offset + sizeof(xfs_agino_t) - 1));
1925                 xfs_inobp_check(mp, ibp);
1926         }
1927
1928         /*
1929          * Point the bucket head pointer at the inode being inserted.
1930          */
1931         ASSERT(agino != 0);
1932         agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1933         offset = offsetof(xfs_agi_t, agi_unlinked) +
1934                 (sizeof(xfs_agino_t) * bucket_index);
1935         xfs_trans_log_buf(tp, agibp, offset,
1936                           (offset + sizeof(xfs_agino_t) - 1));
1937         return 0;
1938 }
1939
1940 /*
1941  * Pull the on-disk inode from the AGI unlinked list.
1942  */
1943 STATIC int
1944 xfs_iunlink_remove(
1945         xfs_trans_t     *tp,
1946         xfs_inode_t     *ip)
1947 {
1948         xfs_ino_t       next_ino;
1949         xfs_mount_t     *mp;
1950         xfs_agi_t       *agi;
1951         xfs_dinode_t    *dip;
1952         xfs_buf_t       *agibp;
1953         xfs_buf_t       *ibp;
1954         xfs_agnumber_t  agno;
1955         xfs_agino_t     agino;
1956         xfs_agino_t     next_agino;
1957         xfs_buf_t       *last_ibp;
1958         xfs_dinode_t    *last_dip = NULL;
1959         short           bucket_index;
1960         int             offset, last_offset = 0;
1961         int             error;
1962
1963         mp = tp->t_mountp;
1964         agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1965
1966         /*
1967          * Get the agi buffer first.  It ensures lock ordering
1968          * on the list.
1969          */
1970         error = xfs_read_agi(mp, tp, agno, &agibp);
1971         if (error)
1972                 return error;
1973
1974         agi = XFS_BUF_TO_AGI(agibp);
1975
1976         /*
1977          * Get the index into the agi hash table for the
1978          * list this inode will go on.
1979          */
1980         agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1981         ASSERT(agino != 0);
1982         bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1983         ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
1984         ASSERT(agi->agi_unlinked[bucket_index]);
1985
1986         if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
1987                 /*
1988                  * We're at the head of the list.  Get the inode's on-disk
1989                  * buffer to see if there is anyone after us on the list.
1990                  * Only modify our next pointer if it is not already NULLAGINO.
1991                  * This saves us the overhead of dealing with the buffer when
1992                  * there is no need to change it.
1993                  */
1994                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1995                                        0, 0);
1996                 if (error) {
1997                         xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
1998                                 __func__, error);
1999                         return error;
2000                 }
2001                 next_agino = be32_to_cpu(dip->di_next_unlinked);
2002                 ASSERT(next_agino != 0);
2003                 if (next_agino != NULLAGINO) {
2004                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2005                         offset = ip->i_imap.im_boffset +
2006                                 offsetof(xfs_dinode_t, di_next_unlinked);
2007
2008                         /* need to recalc the inode CRC if appropriate */
2009                         xfs_dinode_calc_crc(mp, dip);
2010
2011                         xfs_trans_inode_buf(tp, ibp);
2012                         xfs_trans_log_buf(tp, ibp, offset,
2013                                           (offset + sizeof(xfs_agino_t) - 1));
2014                         xfs_inobp_check(mp, ibp);
2015                 } else {
2016                         xfs_trans_brelse(tp, ibp);
2017                 }
2018                 /*
2019                  * Point the bucket head pointer at the next inode.
2020                  */
2021                 ASSERT(next_agino != 0);
2022                 ASSERT(next_agino != agino);
2023                 agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2024                 offset = offsetof(xfs_agi_t, agi_unlinked) +
2025                         (sizeof(xfs_agino_t) * bucket_index);
2026                 xfs_trans_log_buf(tp, agibp, offset,
2027                                   (offset + sizeof(xfs_agino_t) - 1));
2028         } else {
2029                 /*
2030                  * We need to search the list for the inode being freed.
2031                  */
2032                 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2033                 last_ibp = NULL;
2034                 while (next_agino != agino) {
2035                         struct xfs_imap imap;
2036
2037                         if (last_ibp)
2038                                 xfs_trans_brelse(tp, last_ibp);
2039
2040                         imap.im_blkno = 0;
2041                         next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2042
2043                         error = xfs_imap(mp, tp, next_ino, &imap, 0);
2044                         if (error) {
2045                                 xfs_warn(mp,
2046         "%s: xfs_imap returned error %d.",
2047                                          __func__, error);
2048                                 return error;
2049                         }
2050
2051                         error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
2052                                                &last_ibp, 0, 0);
2053                         if (error) {
2054                                 xfs_warn(mp,
2055         "%s: xfs_imap_to_bp returned error %d.",
2056                                         __func__, error);
2057                                 return error;
2058                         }
2059
2060                         last_offset = imap.im_boffset;
2061                         next_agino = be32_to_cpu(last_dip->di_next_unlinked);
2062                         ASSERT(next_agino != NULLAGINO);
2063                         ASSERT(next_agino != 0);
2064                 }
2065
2066                 /*
2067                  * Now last_ibp points to the buffer previous to us on the
2068                  * unlinked list.  Pull us from the list.
2069                  */
2070                 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2071                                        0, 0);
2072                 if (error) {
2073                         xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
2074                                 __func__, error);
2075                         return error;
2076                 }
2077                 next_agino = be32_to_cpu(dip->di_next_unlinked);
2078                 ASSERT(next_agino != 0);
2079                 ASSERT(next_agino != agino);
2080                 if (next_agino != NULLAGINO) {
2081                         dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2082                         offset = ip->i_imap.im_boffset +
2083                                 offsetof(xfs_dinode_t, di_next_unlinked);
2084
2085                         /* need to recalc the inode CRC if appropriate */
2086                         xfs_dinode_calc_crc(mp, dip);
2087
2088                         xfs_trans_inode_buf(tp, ibp);
2089                         xfs_trans_log_buf(tp, ibp, offset,
2090                                           (offset + sizeof(xfs_agino_t) - 1));
2091                         xfs_inobp_check(mp, ibp);
2092                 } else {
2093                         xfs_trans_brelse(tp, ibp);
2094                 }
2095                 /*
2096                  * Point the previous inode on the list to the next inode.
2097                  */
2098                 last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2099                 ASSERT(next_agino != 0);
2100                 offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2101
2102                 /* need to recalc the inode CRC if appropriate */
2103                 xfs_dinode_calc_crc(mp, last_dip);
2104
2105                 xfs_trans_inode_buf(tp, last_ibp);
2106                 xfs_trans_log_buf(tp, last_ibp, offset,
2107                                   (offset + sizeof(xfs_agino_t) - 1));
2108                 xfs_inobp_check(mp, last_ibp);
2109         }
2110         return 0;
2111 }
2112
2113 /*
2114  * A big issue when freeing the inode cluster is that we _cannot_ skip any
2115  * inodes that are in memory - they all must be marked stale and attached to
2116  * the cluster buffer.
2117  */
2118 STATIC int
2119 xfs_ifree_cluster(
2120         xfs_inode_t     *free_ip,
2121         xfs_trans_t     *tp,
2122         xfs_ino_t       inum)
2123 {
2124         xfs_mount_t             *mp = free_ip->i_mount;
2125         int                     blks_per_cluster;
2126         int                     nbufs;
2127         int                     ninodes;
2128         int                     i, j;
2129         xfs_daddr_t             blkno;
2130         xfs_buf_t               *bp;
2131         xfs_inode_t             *ip;
2132         xfs_inode_log_item_t    *iip;
2133         xfs_log_item_t          *lip;
2134         struct xfs_perag        *pag;
2135
2136         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2137         if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
2138                 blks_per_cluster = 1;
2139                 ninodes = mp->m_sb.sb_inopblock;
2140                 nbufs = XFS_IALLOC_BLOCKS(mp);
2141         } else {
2142                 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
2143                                         mp->m_sb.sb_blocksize;
2144                 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
2145                 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
2146         }
2147
2148         for (j = 0; j < nbufs; j++, inum += ninodes) {
2149                 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2150                                          XFS_INO_TO_AGBNO(mp, inum));
2151
2152                 /*
2153                  * We obtain and lock the backing buffer first in the process
2154                  * here, as we have to ensure that any dirty inode that we
2155                  * can't get the flush lock on is attached to the buffer.
2156                  * If we scan the in-memory inodes first, then buffer IO can
2157                  * complete before we get a lock on it, and hence we may fail
2158                  * to mark all the active inodes on the buffer stale.
2159                  */
2160                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2161                                         mp->m_bsize * blks_per_cluster,
2162                                         XBF_UNMAPPED);
2163
2164                 if (!bp)
2165                         return ENOMEM;
2166
2167                 /*
2168                  * This buffer may not have been correctly initialised as we
2169                  * didn't read it from disk. That's not important because we are
2170                  * only using to mark the buffer as stale in the log, and to
2171                  * attach stale cached inodes on it. That means it will never be
2172                  * dispatched for IO. If it is, we want to know about it, and we
2173                  * want it to fail. We can acheive this by adding a write
2174                  * verifier to the buffer.
2175                  */
2176                  bp->b_ops = &xfs_inode_buf_ops;
2177
2178                 /*
2179                  * Walk the inodes already attached to the buffer and mark them
2180                  * stale. These will all have the flush locks held, so an
2181                  * in-memory inode walk can't lock them. By marking them all
2182                  * stale first, we will not attempt to lock them in the loop
2183                  * below as the XFS_ISTALE flag will be set.
2184                  */
2185                 lip = bp->b_fspriv;
2186                 while (lip) {
2187                         if (lip->li_type == XFS_LI_INODE) {
2188                                 iip = (xfs_inode_log_item_t *)lip;
2189                                 ASSERT(iip->ili_logged == 1);
2190                                 lip->li_cb = xfs_istale_done;
2191                                 xfs_trans_ail_copy_lsn(mp->m_ail,
2192                                                         &iip->ili_flush_lsn,
2193                                                         &iip->ili_item.li_lsn);
2194                                 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2195                         }
2196                         lip = lip->li_bio_list;
2197                 }
2198
2199
2200                 /*
2201                  * For each inode in memory attempt to add it to the inode
2202                  * buffer and set it up for being staled on buffer IO
2203                  * completion.  This is safe as we've locked out tail pushing
2204                  * and flushing by locking the buffer.
2205                  *
2206                  * We have already marked every inode that was part of a
2207                  * transaction stale above, which means there is no point in
2208                  * even trying to lock them.
2209                  */
2210                 for (i = 0; i < ninodes; i++) {
2211 retry:
2212                         rcu_read_lock();
2213                         ip = radix_tree_lookup(&pag->pag_ici_root,
2214                                         XFS_INO_TO_AGINO(mp, (inum + i)));
2215
2216                         /* Inode not in memory, nothing to do */
2217                         if (!ip) {
2218                                 rcu_read_unlock();
2219                                 continue;
2220                         }
2221
2222                         /*
2223                          * because this is an RCU protected lookup, we could
2224                          * find a recently freed or even reallocated inode
2225                          * during the lookup. We need to check under the
2226                          * i_flags_lock for a valid inode here. Skip it if it
2227                          * is not valid, the wrong inode or stale.
2228                          */
2229                         spin_lock(&ip->i_flags_lock);
2230                         if (ip->i_ino != inum + i ||
2231                             __xfs_iflags_test(ip, XFS_ISTALE)) {
2232                                 spin_unlock(&ip->i_flags_lock);
2233                                 rcu_read_unlock();
2234                                 continue;
2235                         }
2236                         spin_unlock(&ip->i_flags_lock);
2237
2238                         /*
2239                          * Don't try to lock/unlock the current inode, but we
2240                          * _cannot_ skip the other inodes that we did not find
2241                          * in the list attached to the buffer and are not
2242                          * already marked stale. If we can't lock it, back off
2243                          * and retry.
2244                          */
2245                         if (ip != free_ip &&
2246                             !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2247                                 rcu_read_unlock();
2248                                 delay(1);
2249                                 goto retry;
2250                         }
2251                         rcu_read_unlock();
2252
2253                         xfs_iflock(ip);
2254                         xfs_iflags_set(ip, XFS_ISTALE);
2255
2256                         /*
2257                          * we don't need to attach clean inodes or those only
2258                          * with unlogged changes (which we throw away, anyway).
2259                          */
2260                         iip = ip->i_itemp;
2261                         if (!iip || xfs_inode_clean(ip)) {
2262                                 ASSERT(ip != free_ip);
2263                                 xfs_ifunlock(ip);
2264                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2265                                 continue;
2266                         }
2267
2268                         iip->ili_last_fields = iip->ili_fields;
2269                         iip->ili_fields = 0;
2270                         iip->ili_logged = 1;
2271                         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2272                                                 &iip->ili_item.li_lsn);
2273
2274                         xfs_buf_attach_iodone(bp, xfs_istale_done,
2275                                                   &iip->ili_item);
2276
2277                         if (ip != free_ip)
2278                                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2279                 }
2280
2281                 xfs_trans_stale_inode_buf(tp, bp);
2282                 xfs_trans_binval(tp, bp);
2283         }
2284
2285         xfs_perag_put(pag);
2286         return 0;
2287 }
2288
2289 /*
2290  * This is called to return an inode to the inode free list.
2291  * The inode should already be truncated to 0 length and have
2292  * no pages associated with it.  This routine also assumes that
2293  * the inode is already a part of the transaction.
2294  *
2295  * The on-disk copy of the inode will have been added to the list
2296  * of unlinked inodes in the AGI. We need to remove the inode from
2297  * that list atomically with respect to freeing it here.
2298  */
2299 int
2300 xfs_ifree(
2301         xfs_trans_t     *tp,
2302         xfs_inode_t     *ip,
2303         xfs_bmap_free_t *flist)
2304 {
2305         int                     error;
2306         int                     delete;
2307         xfs_ino_t               first_ino;
2308
2309         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2310         ASSERT(ip->i_d.di_nlink == 0);
2311         ASSERT(ip->i_d.di_nextents == 0);
2312         ASSERT(ip->i_d.di_anextents == 0);
2313         ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
2314         ASSERT(ip->i_d.di_nblocks == 0);
2315
2316         /*
2317          * Pull the on-disk inode from the AGI unlinked list.
2318          */
2319         error = xfs_iunlink_remove(tp, ip);
2320         if (error)
2321                 return error;
2322
2323         error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2324         if (error)
2325                 return error;
2326
2327         ip->i_d.di_mode = 0;            /* mark incore inode as free */
2328         ip->i_d.di_flags = 0;
2329         ip->i_d.di_dmevmask = 0;
2330         ip->i_d.di_forkoff = 0;         /* mark the attr fork not in use */
2331         ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2332         ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2333         /*
2334          * Bump the generation count so no one will be confused
2335          * by reincarnations of this inode.
2336          */
2337         ip->i_d.di_gen++;
2338         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2339
2340         if (delete)
2341                 error = xfs_ifree_cluster(ip, tp, first_ino);
2342
2343         return error;
2344 }
2345
2346 /*
2347  * This is called to unpin an inode.  The caller must have the inode locked
2348  * in at least shared mode so that the buffer cannot be subsequently pinned
2349  * once someone is waiting for it to be unpinned.
2350  */
2351 static void
2352 xfs_iunpin(
2353         struct xfs_inode        *ip)
2354 {
2355         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2356
2357         trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2358
2359         /* Give the log a push to start the unpinning I/O */
2360         xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2361
2362 }
2363
2364 static void
2365 __xfs_iunpin_wait(
2366         struct xfs_inode        *ip)
2367 {
2368         wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2369         DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2370
2371         xfs_iunpin(ip);
2372
2373         do {
2374                 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2375                 if (xfs_ipincount(ip))
2376                         io_schedule();
2377         } while (xfs_ipincount(ip));
2378         finish_wait(wq, &wait.wait);
2379 }
2380
/*
 * Wait for an inode to become unpinned.  Returns straight away when the
 * pin count is already zero; otherwise hands off to __xfs_iunpin_wait().
 */
void
xfs_iunpin_wait(
        struct xfs_inode        *ip)
{
        if (!xfs_ipincount(ip))
                return;

        __xfs_iunpin_wait(ip);
}
2388
2389 /*
2390  * Removing an inode from the namespace involves removing the directory entry
2391  * and dropping the link count on the inode. Removing the directory entry can
2392  * result in locking an AGF (directory blocks were freed) and removing a link
2393  * count can result in placing the inode on an unlinked list which results in
2394  * locking an AGI.
2395  *
2396  * The big problem here is that we have an ordering constraint on AGF and AGI
2397  * locking - inode allocation locks the AGI, then can allocate a new extent for
2398  * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2399  * removes the inode from the unlinked list, requiring that we lock the AGI
2400  * first, and then freeing the inode can result in an inode chunk being freed
2401  * and hence freeing disk space requiring that we lock an AGF.
2402  *
2403  * Hence the ordering that is imposed by other parts of the code is AGI before
2404  * AGF. This means we cannot remove the directory entry before we drop the inode
2405  * reference count and put it on the unlinked list as this results in a lock
2406  * order of AGF then AGI, and this can deadlock against inode allocation and
2407  * freeing. Therefore we must drop the link counts before we remove the
2408  * directory entry.
2409  *
2410  * This is still safe from a transactional point of view - it is not until we
2411  * get to xfs_bmap_finish() that we have the possibility of multiple
2412  * transactions in this operation. Hence as long as we remove the directory
2413  * entry and drop the link count in the first transaction of the remove
2414  * operation, there are no transactional constraints on the ordering here.
2415  */
2416 int
2417 xfs_remove(
2418         xfs_inode_t             *dp,
2419         struct xfs_name         *name,
2420         xfs_inode_t             *ip)
2421 {
2422         xfs_mount_t             *mp = dp->i_mount;
2423         xfs_trans_t             *tp = NULL;
2424         int                     is_dir = S_ISDIR(ip->i_d.di_mode);
2425         int                     error = 0;
2426         xfs_bmap_free_t         free_list;
2427         xfs_fsblock_t           first_block;
2428         int                     cancel_flags;
2429         int                     committed;
2430         int                     link_zero;
2431         uint                    resblks;
2432         uint                    log_count;
2433
2434         trace_xfs_remove(dp, name);
2435
2436         if (XFS_FORCED_SHUTDOWN(mp))
2437                 return XFS_ERROR(EIO);
2438
2439         error = xfs_qm_dqattach(dp, 0);
2440         if (error)
2441                 goto std_return;
2442
2443         error = xfs_qm_dqattach(ip, 0);
2444         if (error)
2445                 goto std_return;
2446
2447         if (is_dir) {
2448                 tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2449                 log_count = XFS_DEFAULT_LOG_COUNT;
2450         } else {
2451                 tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2452                 log_count = XFS_REMOVE_LOG_COUNT;
2453         }
2454         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2455
2456         /*
2457          * We try to get the real space reservation first,
2458          * allowing for directory btree deletion(s) implying
2459          * possible bmap insert(s).  If we can't get the space
2460          * reservation then we use 0 instead, and avoid the bmap
2461          * btree insert(s) in the directory code by, if the bmap
2462          * insert tries to happen, instead trimming the LAST
2463          * block from the directory.
2464          */
2465         resblks = XFS_REMOVE_SPACE_RES(mp);
2466         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
2467         if (error == ENOSPC) {
2468                 resblks = 0;
2469                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
2470         }
2471         if (error) {
2472                 ASSERT(error != ENOSPC);
2473                 cancel_flags = 0;
2474                 goto out_trans_cancel;
2475         }
2476
2477         xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2478
2479         xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2480         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2481
2482         /*
2483          * If we're removing a directory perform some additional validation.
2484          */
2485         cancel_flags |= XFS_TRANS_ABORT;
2486         if (is_dir) {
2487                 ASSERT(ip->i_d.di_nlink >= 2);
2488                 if (ip->i_d.di_nlink != 2) {
2489                         error = XFS_ERROR(ENOTEMPTY);
2490                         goto out_trans_cancel;
2491                 }
2492                 if (!xfs_dir_isempty(ip)) {
2493                         error = XFS_ERROR(ENOTEMPTY);
2494                         goto out_trans_cancel;
2495                 }
2496
2497                 /* Drop the link from ip's "..".  */
2498                 error = xfs_droplink(tp, dp);
2499                 if (error)
2500                         goto out_trans_cancel;
2501
2502                 /* Drop the "." link from ip to self.  */
2503                 error = xfs_droplink(tp, ip);
2504                 if (error)
2505                         goto out_trans_cancel;
2506         } else {
2507                 /*
2508                  * When removing a non-directory we need to log the parent
2509                  * inode here.  For a directory this is done implicitly
2510                  * by the xfs_droplink call for the ".." entry.
2511                  */
2512                 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2513         }
2514         xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2515
2516         /* Drop the link from dp to ip. */
2517         error = xfs_droplink(tp, ip);
2518         if (error)
2519                 goto out_trans_cancel;
2520
2521         /* Determine if this is the last link while the inode is locked */
2522         link_zero = (ip->i_d.di_nlink == 0);
2523
2524         xfs_bmap_init(&free_list, &first_block);
2525         error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2526                                         &first_block, &free_list, resblks);
2527         if (error) {
2528                 ASSERT(error != ENOENT);
2529                 goto out_bmap_cancel;
2530         }
2531
2532         /*
2533          * If this is a synchronous mount, make sure that the
2534          * remove transaction goes to disk before returning to
2535          * the user.
2536          */
2537         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2538                 xfs_trans_set_sync(tp);
2539
2540         error = xfs_bmap_finish(&tp, &free_list, &committed);
2541         if (error)
2542                 goto out_bmap_cancel;
2543
2544         error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2545         if (error)
2546                 goto std_return;
2547
2548         /*
2549          * If we are using filestreams, kill the stream association.
2550          * If the file is still open it may get a new one but that
2551          * will get killed on last close in xfs_close() so we don't
2552          * have to worry about that.
2553          */
2554         if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
2555                 xfs_filestream_deassociate(ip);
2556
2557         return 0;
2558
2559  out_bmap_cancel:
2560         xfs_bmap_cancel(&free_list);
2561  out_trans_cancel:
2562         xfs_trans_cancel(tp, cancel_flags);
2563  std_return:
2564         return error;
2565 }
2566
2567 /*
2568  * Enter all inodes for a rename transaction into a sorted array.
2569  */
2570 STATIC void
2571 xfs_sort_for_rename(
2572         xfs_inode_t     *dp1,   /* in: old (source) directory inode */
2573         xfs_inode_t     *dp2,   /* in: new (target) directory inode */
2574         xfs_inode_t     *ip1,   /* in: inode of old entry */
2575         xfs_inode_t     *ip2,   /* in: inode of new entry, if it
2576                                    already exists, NULL otherwise. */
2577         xfs_inode_t     **i_tab,/* out: array of inode returned, sorted */
2578         int             *num_inodes)  /* out: number of inodes in array */
2579 {
2580         xfs_inode_t             *temp;
2581         int                     i, j;
2582
2583         /*
2584          * i_tab contains a list of pointers to inodes.  We initialize
2585          * the table here & we'll sort it.  We will then use it to
2586          * order the acquisition of the inode locks.
2587          *
2588          * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2589          */
2590         i_tab[0] = dp1;
2591         i_tab[1] = dp2;
2592         i_tab[2] = ip1;
2593         if (ip2) {
2594                 *num_inodes = 4;
2595                 i_tab[3] = ip2;
2596         } else {
2597                 *num_inodes = 3;
2598                 i_tab[3] = NULL;
2599         }
2600
2601         /*
2602          * Sort the elements via bubble sort.  (Remember, there are at
2603          * most 4 elements to sort, so this is adequate.)
2604          */
2605         for (i = 0; i < *num_inodes; i++) {
2606                 for (j = 1; j < *num_inodes; j++) {
2607                         if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2608                                 temp = i_tab[j];
2609                                 i_tab[j] = i_tab[j-1];
2610                                 i_tab[j-1] = temp;
2611                         }
2612                 }
2613         }
2614 }
2615
2616 /*
2617  * xfs_rename
2618  */
2619 int
2620 xfs_rename(
2621         xfs_inode_t     *src_dp,
2622         struct xfs_name *src_name,
2623         xfs_inode_t     *src_ip,
2624         xfs_inode_t     *target_dp,
2625         struct xfs_name *target_name,
2626         xfs_inode_t     *target_ip)
2627 {
2628         xfs_trans_t     *tp = NULL;
2629         xfs_mount_t     *mp = src_dp->i_mount;
2630         int             new_parent;             /* moving to a new dir */
2631         int             src_is_directory;       /* src_name is a directory */
2632         int             error;
2633         xfs_bmap_free_t free_list;
2634         xfs_fsblock_t   first_block;
2635         int             cancel_flags;
2636         int             committed;
2637         xfs_inode_t     *inodes[4];
2638         int             spaceres;
2639         int             num_inodes;
2640
2641         trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2642
2643         new_parent = (src_dp != target_dp);
2644         src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2645
2646         xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
2647                                 inodes, &num_inodes);
2648
2649         xfs_bmap_init(&free_list, &first_block);
2650         tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
2651         cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2652         spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2653         error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2654         if (error == ENOSPC) {
2655                 spaceres = 0;
2656                 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2657         }
2658         if (error) {
2659                 xfs_trans_cancel(tp, 0);
2660                 goto std_return;
2661         }
2662
2663         /*
2664          * Attach the dquots to the inodes
2665          */
2666         error = xfs_qm_vop_rename_dqattach(inodes);
2667         if (error) {
2668                 xfs_trans_cancel(tp, cancel_flags);
2669                 goto std_return;
2670         }
2671
2672         /*
2673          * Lock all the participating inodes. Depending upon whether
2674          * the target_name exists in the target directory, and
2675          * whether the target directory is the same as the source
2676          * directory, we can lock from 2 to 4 inodes.
2677          */
2678         xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2679
2680         /*
2681          * Join all the inodes to the transaction. From this point on,
2682          * we can rely on either trans_commit or trans_cancel to unlock
2683          * them.
2684          */
2685         xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2686         if (new_parent)
2687                 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2688         xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2689         if (target_ip)
2690                 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2691
2692         /*
2693          * If we are using project inheritance, we only allow renames
2694          * into our tree when the project IDs are the same; else the
2695          * tree quota mechanism would be circumvented.
2696          */
2697         if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2698                      (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2699                 error = XFS_ERROR(EXDEV);
2700                 goto error_return;
2701         }
2702
2703         /*
2704          * Set up the target.
2705          */
2706         if (target_ip == NULL) {
2707                 /*
2708                  * If there's no space reservation, check the entry will
2709                  * fit before actually inserting it.
2710                  */
2711                 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
2712                 if (error)
2713                         goto error_return;
2714                 /*
2715                  * If target does not exist and the rename crosses
2716                  * directories, adjust the target directory link count
2717                  * to account for the ".." reference from the new entry.
2718                  */
2719                 error = xfs_dir_createname(tp, target_dp, target_name,
2720                                                 src_ip->i_ino, &first_block,
2721                                                 &free_list, spaceres);
2722                 if (error == ENOSPC)
2723                         goto error_return;
2724                 if (error)
2725                         goto abort_return;
2726
2727                 xfs_trans_ichgtime(tp, target_dp,
2728                                         XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2729
2730                 if (new_parent && src_is_directory) {
2731                         error = xfs_bumplink(tp, target_dp);
2732                         if (error)
2733                                 goto abort_return;
2734                 }
2735         } else { /* target_ip != NULL */
2736                 /*
2737                  * If target exists and it's a directory, check that both
2738                  * target and source are directories and that target can be
2739                  * destroyed, or that neither is a directory.
2740                  */
2741                 if (S_ISDIR(target_ip->i_d.di_mode)) {
2742                         /*
2743                          * Make sure target dir is empty.
2744                          */
2745                         if (!(xfs_dir_isempty(target_ip)) ||
2746                             (target_ip->i_d.di_nlink > 2)) {
2747                                 error = XFS_ERROR(EEXIST);
2748                                 goto error_return;
2749                         }
2750                 }
2751
2752                 /*
2753                  * Link the source inode under the target name.
2754                  * If the source inode is a directory and we are moving
2755                  * it across directories, its ".." entry will be
2756                  * inconsistent until we replace that down below.
2757                  *
2758                  * In case there is already an entry with the same
2759                  * name at the destination directory, remove it first.
2760                  */
2761                 error = xfs_dir_replace(tp, target_dp, target_name,
2762                                         src_ip->i_ino,
2763                                         &first_block, &free_list, spaceres);
2764                 if (error)
2765                         goto abort_return;
2766
2767                 xfs_trans_ichgtime(tp, target_dp,
2768                                         XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2769
2770                 /*
2771                  * Decrement the link count on the target since the target
2772                  * dir no longer points to it.
2773                  */
2774                 error = xfs_droplink(tp, target_ip);
2775                 if (error)
2776                         goto abort_return;
2777
2778                 if (src_is_directory) {
2779                         /*
2780                          * Drop the link from the old "." entry.
2781                          */
2782                         error = xfs_droplink(tp, target_ip);
2783                         if (error)
2784                                 goto abort_return;
2785                 }
2786         } /* target_ip != NULL */
2787
2788         /*
2789          * Remove the source.
2790          */
2791         if (new_parent && src_is_directory) {
2792                 /*
2793                  * Rewrite the ".." entry to point to the new
2794                  * directory.
2795                  */
2796                 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
2797                                         target_dp->i_ino,
2798                                         &first_block, &free_list, spaceres);
2799                 ASSERT(error != EEXIST);
2800                 if (error)
2801                         goto abort_return;
2802         }
2803
2804         /*
2805          * We always want to hit the ctime on the source inode.
2806          *
2807          * This isn't strictly required by the standards since the source
2808          * inode isn't really being changed, but old unix file systems did
2809          * it and some incremental backup programs won't work without it.
2810          */
2811         xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
2812         xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
2813
2814         /*
2815          * Adjust the link count on src_dp.  This is necessary when
2816          * renaming a directory, either within one parent when
2817          * the target existed, or across two parent directories.
2818          */
2819         if (src_is_directory && (new_parent || target_ip != NULL)) {
2820
2821                 /*
2822                  * Decrement link count on src_directory since the
2823                  * entry that's moved no longer points to it.
2824                  */
2825                 error = xfs_droplink(tp, src_dp);
2826                 if (error)
2827                         goto abort_return;
2828         }
2829
2830         error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
2831                                         &first_block, &free_list, spaceres);
2832         if (error)
2833                 goto abort_return;
2834
2835         xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2836         xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
2837         if (new_parent)
2838                 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
2839
2840         /*
2841          * If this is a synchronous mount, make sure that the
2842          * rename transaction goes to disk before returning to
2843          * the user.
2844          */
2845         if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2846                 xfs_trans_set_sync(tp);
2847         }
2848
2849         error = xfs_bmap_finish(&tp, &free_list, &committed);
2850         if (error) {
2851                 xfs_bmap_cancel(&free_list);
2852                 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
2853                                  XFS_TRANS_ABORT));
2854                 goto std_return;
2855         }
2856
2857         /*
2858          * trans_commit will unlock src_ip, target_ip & decrement
2859          * the vnode references.
2860          */
2861         return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2862
2863  abort_return:
2864         cancel_flags |= XFS_TRANS_ABORT;
2865  error_return:
2866         xfs_bmap_cancel(&free_list);
2867         xfs_trans_cancel(tp, cancel_flags);
2868  std_return:
2869         return error;
2870 }
2871
2872 STATIC int
2873 xfs_iflush_cluster(
2874         xfs_inode_t     *ip,
2875         xfs_buf_t       *bp)
2876 {
2877         xfs_mount_t             *mp = ip->i_mount;
2878         struct xfs_perag        *pag;
2879         unsigned long           first_index, mask;
2880         unsigned long           inodes_per_cluster;
2881         int                     ilist_size;
2882         xfs_inode_t             **ilist;
2883         xfs_inode_t             *iq;
2884         int                     nr_found;
2885         int                     clcount = 0;
2886         int                     bufwasdelwri;
2887         int                     i;
2888
2889         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2890
2891         inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2892         ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2893         ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2894         if (!ilist)
2895                 goto out_put;
2896
2897         mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2898         first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2899         rcu_read_lock();
2900         /* really need a gang lookup range call here */
2901         nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2902                                         first_index, inodes_per_cluster);
2903         if (nr_found == 0)
2904                 goto out_free;
2905
2906         for (i = 0; i < nr_found; i++) {
2907                 iq = ilist[i];
2908                 if (iq == ip)
2909                         continue;
2910
2911                 /*
2912                  * because this is an RCU protected lookup, we could find a
2913                  * recently freed or even reallocated inode during the lookup.
2914                  * We need to check under the i_flags_lock for a valid inode
2915                  * here. Skip it if it is not valid or the wrong inode.
2916                  */
2917                 spin_lock(&ip->i_flags_lock);
2918                 if (!ip->i_ino ||
2919                     (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2920                         spin_unlock(&ip->i_flags_lock);
2921                         continue;
2922                 }
2923                 spin_unlock(&ip->i_flags_lock);
2924
2925                 /*
2926                  * Do an un-protected check to see if the inode is dirty and
2927                  * is a candidate for flushing.  These checks will be repeated
2928                  * later after the appropriate locks are acquired.
2929                  */
2930                 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2931                         continue;
2932
2933                 /*
2934                  * Try to get locks.  If any are unavailable or it is pinned,
2935                  * then this inode cannot be flushed and is skipped.
2936                  */
2937
2938                 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2939                         continue;
2940                 if (!xfs_iflock_nowait(iq)) {
2941                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
2942                         continue;
2943                 }
2944                 if (xfs_ipincount(iq)) {
2945                         xfs_ifunlock(iq);
2946                         xfs_iunlock(iq, XFS_ILOCK_SHARED);
2947                         continue;
2948                 }
2949
2950                 /*
2951                  * arriving here means that this inode can be flushed.  First
2952                  * re-check that it's dirty before flushing.
2953                  */
2954                 if (!xfs_inode_clean(iq)) {
2955                         int     error;
2956                         error = xfs_iflush_int(iq, bp);
2957                         if (error) {
2958                                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2959                                 goto cluster_corrupt_out;
2960                         }
2961                         clcount++;
2962                 } else {
2963                         xfs_ifunlock(iq);
2964                 }
2965                 xfs_iunlock(iq, XFS_ILOCK_SHARED);
2966         }
2967
2968         if (clcount) {
2969                 XFS_STATS_INC(xs_icluster_flushcnt);
2970                 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2971         }
2972
2973 out_free:
2974         rcu_read_unlock();
2975         kmem_free(ilist);
2976 out_put:
2977         xfs_perag_put(pag);
2978         return 0;
2979
2980
2981 cluster_corrupt_out:
2982         /*
2983          * Corruption detected in the clustering loop.  Invalidate the
2984          * inode buffer and shut down the filesystem.
2985          */
2986         rcu_read_unlock();
2987         /*
2988          * Clean up the buffer.  If it was delwri, just release it --
2989          * brelse can handle it with no problems.  If not, shut down the
2990          * filesystem before releasing the buffer.
2991          */
2992         bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
2993         if (bufwasdelwri)
2994                 xfs_buf_relse(bp);
2995
2996         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2997
2998         if (!bufwasdelwri) {
2999                 /*
3000                  * Just like incore_relse: if we have b_iodone functions,
3001                  * mark the buffer as an error and call them.  Otherwise
3002                  * mark it as stale and brelse.
3003                  */
3004                 if (bp->b_iodone) {
3005                         XFS_BUF_UNDONE(bp);
3006                         xfs_buf_stale(bp);
3007                         xfs_buf_ioerror(bp, EIO);
3008                         xfs_buf_ioend(bp, 0);
3009                 } else {
3010                         xfs_buf_stale(bp);
3011                         xfs_buf_relse(bp);
3012                 }
3013         }
3014
3015         /*
3016          * Unlocks the flush lock
3017          */
3018         xfs_iflush_abort(iq, false);
3019         kmem_free(ilist);
3020         xfs_perag_put(pag);
3021         return XFS_ERROR(EFSCORRUPTED);
3022 }
3023
3024 /*
3025  * Flush dirty inode metadata into the backing buffer.
3026  *
3027  * The caller must have the inode lock and the inode flush lock held.  The
3028  * inode lock will still be held upon return to the caller, and the inode
3029  * flush lock will be released after the inode has reached the disk.
3030  *
3031  * The caller must write out the buffer returned in *bpp and release it.
3032  */
3033 int
3034 xfs_iflush(
3035         struct xfs_inode        *ip,
3036         struct xfs_buf          **bpp)
3037 {
3038         struct xfs_mount        *mp = ip->i_mount;
3039         struct xfs_buf          *bp;
3040         struct xfs_dinode       *dip;   /* only an out-param for xfs_imap_to_bp() here */
3041         int                     error;
3042
3043         XFS_STATS_INC(xs_iflush_count);
3044
             /*
              * Entry requirements: the inode lock is held (exclusive or shared)
              * and the flush lock is held.  For a btree-format inode,
              * di_nextents must exceed XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK).
              */
3045         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3046         ASSERT(xfs_isiflocked(ip));
3047         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3048                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3049
             /* The caller sees a NULL buffer on every return except success. */
3050         *bpp = NULL;
3051
3052         xfs_iunpin_wait(ip);
3053
3054         /*
3055          * For stale inodes we cannot rely on the backing buffer remaining
3056          * stale in cache for the remaining life of the stale inode, so
3057          * xfs_imap_to_bp() below may give us a buffer that no longer contains
3058          * inodes. We have to check this after ensuring the inode is unpinned
3059          * so that it is safe to reclaim the stale inode after the flush call.
3060          * Nothing is flushed: drop the flush lock, return 0, *bpp stays NULL.
3061          */
3062         if (xfs_iflags_test(ip, XFS_ISTALE)) {
3063                 xfs_ifunlock(ip);
3064                 return 0;
3065         }
3066
3067         /*
3068          * This may have been unpinned because the filesystem is shutting
3069          * down forcibly. If that's the case we must not write this inode
3070          * to disk, because the log record didn't make it to disk.
3071          *
3072          * We also have to remove the log item from the AIL in this case,
3073          * as we wait for an empty AIL as part of the unmount process.
              * The flush is aborted via xfs_iflush_abort() and EIO is returned.
3074          */
3075         if (XFS_FORCED_SHUTDOWN(mp)) {
3076                 error = XFS_ERROR(EIO);
3077                 goto abort_out;
3078         }
3079
3080         /*
3081          * Get the buffer containing the on-disk inode.
              * XBF_TRYLOCK is passed, so a NULL buffer without an error is
              * presumably a failed trylock -- TODO confirm.  Either way the
              * flush lock is dropped and the caller gets error (possibly 0)
              * with *bpp still NULL.
3082          */
3083         error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
3084                                0);
3085         if (error || !bp) {
3086                 xfs_ifunlock(ip);
3087                 return error;
3088         }
3089
3090         /*
3091          * First flush out the inode that xfs_iflush was called with.
              * Any failure here is treated as corruption (see corrupt_out).
3092          */
3093         error = xfs_iflush_int(ip, bp);
3094         if (error)
3095                 goto corrupt_out;
3096
3097         /*
3098          * If the buffer is pinned then push on the log now so we won't
3099          * get stuck waiting in the write for too long.
3100          */
3101         if (xfs_buf_ispinned(bp))
3102                 xfs_log_force(mp, 0);
3103
3104         /*
3105          * inode clustering:
3106          * see if other inodes can be gathered into this write
3107          */
3108         error = xfs_iflush_cluster(ip, bp);
3109         if (error)
3110                 goto cluster_corrupt_out;
3111
             /* Success: hand the buffer to the caller; the flush lock is not released here. */
3112         *bpp = bp;
3113         return 0;
3114
             /*
              * Error unwinding; the labels fall through into each other:
              *
              * corrupt_out:         xfs_iflush_int() failed.  Release the buffer
              *                      and shut the filesystem down.
              * cluster_corrupt_out: xfs_iflush_cluster() failed.  The buffer is
              *                      not released here -- presumably the cluster
              *                      code already dealt with it (TODO confirm).
              *                      Both corruption paths replace the original
              *                      error with EFSCORRUPTED.
              * abort_out:           abort the flush, which releases the flush
              *                      lock, and return the error.
              */
3115 corrupt_out:
3116         xfs_buf_relse(bp);
3117         xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3118 cluster_corrupt_out:
3119         error = XFS_ERROR(EFSCORRUPTED);
3120 abort_out:
3121         /*
3122          * Unlocks the flush lock
3123          */
3124         xfs_iflush_abort(ip, false);
3125         return error;
3126 }
3127
/*
 * Copy the in-core inode @ip into its on-disk image inside @bp.
 *
 * After sanity-checking both copies, this writes the inode core and the
 * data fork (plus the attribute fork when XFS_IFORK_Q(ip) is non-zero) into
 * the buffer, moves the log item's ili_fields into ili_last_fields,
 * attaches xfs_iflush_done() to the buffer and generates the on-disk inode
 * checksum.
 *
 * Asserts that the inode lock and the flush lock are held and that the
 * inode has a log item with at least one ili_fields bit set.
 *
 * Returns 0 on success.  If any sanity check trips, an alert is logged and
 * EFSCORRUPTED is returned; all checks run before the first modification,
 * so neither the inode nor the buffer has been changed in that case.  The
 * buffer and the flush lock are not released here on either path.
 */
3128 STATIC int
3129 xfs_iflush_int(
3130         struct xfs_inode        *ip,
3131         struct xfs_buf          *bp)
3132 {
3133         struct xfs_inode_log_item *iip = ip->i_itemp;
3134         struct xfs_dinode       *dip;
3135         struct xfs_mount        *mp = ip->i_mount;
3136
3137         ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3138         ASSERT(xfs_isiflocked(ip));
3139         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3140                ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3141         ASSERT(iip != NULL && iip->ili_fields != 0);
3142
3143         /* set *dip = inode's place in the buffer */
3144         dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3145
             /*
              * Sanity checks.  Each one logs an alert and bails out with
              * EFSCORRUPTED.  The XFS_ERRTAG_ and XFS_RANDOM_ arguments
              * presumably let XFS_TEST_ERROR() inject the failure as well --
              * TODO confirm against the macro definition.
              */

             /* The on-disk image must carry the inode magic number ... */
3146         if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3147                                mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3148                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3149                         "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3150                         __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3151                 goto corrupt_out;
3152         }
             /* ... and so must the in-core inode. */
3153         if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3154                                 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3155                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3156                         "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3157                         __func__, ip->i_ino, ip, ip->i_d.di_magic);
3158                 goto corrupt_out;
3159         }
3160         if (S_ISREG(ip->i_d.di_mode)) {
                     /* Regular files: only extents or btree format is valid. */
3161                 if (XFS_TEST_ERROR(
3162                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3163                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3164                     mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3165                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3166                                 "%s: Bad regular inode %Lu, ptr 0x%p",
3167                                 __func__, ip->i_ino, ip);
3168                         goto corrupt_out;
3169                 }
3170         } else if (S_ISDIR(ip->i_d.di_mode)) {
                     /* Directories: extents, btree or local format is valid. */
3171                 if (XFS_TEST_ERROR(
3172                     (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3173                     (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3174                     (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3175                     mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3176                         xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3177                                 "%s: Bad directory inode %Lu, ptr 0x%p",
3178                                 __func__, ip->i_ino, ip);
3179                         goto corrupt_out;
3180                 }
3181         }
             /* The combined extent count may not exceed the block count. */
3182         if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3183                                 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3184                                 XFS_RANDOM_IFLUSH_5)) {
3185                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3186                         "%s: detected corrupt incore inode %Lu, "
3187                         "total extents = %d, nblocks = %Ld, ptr 0x%p",
3188                         __func__, ip->i_ino,
3189                         ip->i_d.di_nextents + ip->i_d.di_anextents,
3190                         ip->i_d.di_nblocks, ip);
3191                 goto corrupt_out;
3192         }
             /*
              * NOTE(review): di_forkoff is compared directly with
              * sb_inodesize; confirm both values are in the same units.
              */
3193         if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3194                                 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3195                 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3196                         "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3197                         __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3198                 goto corrupt_out;
3199         }
3200
3201         /*
3202          * Inode item log recovery for v1/v2 inodes is dependent on the
3203          * di_flushiter count for correct sequencing. We bump the flush
3204          * iteration count so we can detect flushes which postdate a log record
3205          * during recovery. This is redundant as we now log every change and
3206          * hence this can't happen but we need to still do it to ensure
3207          * backwards compatibility with old kernels that predate logging all
3208          * inode changes.
              *
              * The bump happens before the copy below so that the on-disk image
              * receives the incremented value.
3209          */
3210         if (ip->i_d.di_version < 3)
3211                 ip->i_d.di_flushiter++;
3212
3213         /*
3214          * Copy the dirty parts of the inode into the on-disk
3215          * inode.  We always copy out the core of the inode,
3216          * because if the inode is dirty at all the core must
3217          * be.
3218          */
3219         xfs_dinode_to_disk(dip, &ip->i_d);
3220
3221         /* Wrap, we never let the log put out DI_MAX_FLUSH */
             /* Only the in-core count is reset; dip was already filled in above. */
3222         if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3223                 ip->i_d.di_flushiter = 0;
3224
3225         /*
3226          * If this is really an old format inode and the superblock version
3227          * has not been updated to support only new format inodes, then
3228          * convert back to the old inode format.  If the superblock version
3229          * has been updated, then make the conversion permanent.
3230          */
3231         ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
3232         if (ip->i_d.di_version == 1) {
3233                 if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3234                         /*
3235                          * Convert it back: the link count goes into the
3236                          * 16-bit di_onlink field of the on-disk inode.
                              */
3237                         ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3238                         dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3239                 } else {
3240                         /*
3241                          * The superblock version has already been bumped,
3242                          * so just make the conversion to the new inode
3243                          * format permanent.  The in-core and on-disk
                              * copies are updated together.
3244                          */
3245                         ip->i_d.di_version = 2;
3246                         dip->di_version = 2;
3247                         ip->i_d.di_onlink = 0;
3248                         dip->di_onlink = 0;
3249                         memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3250                         memset(&(dip->di_pad[0]), 0,
3251                               sizeof(dip->di_pad));
3252                         ASSERT(xfs_get_projid(ip) == 0);
3253                 }
3254         }
3255
             /*
              * Flush the data fork always, and the attribute fork only when
              * XFS_IFORK_Q(ip) is non-zero.
              */
3256         xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
3257         if (XFS_IFORK_Q(ip))
3258                 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3259         xfs_inobp_check(mp, bp);
3260
3261         /*
3262          * We've recorded everything logged in the inode, so we'd like to clear
3263          * the ili_fields bits so we don't log and flush things unnecessarily.
3264          * However, we can't stop logging all this information until the data
3265          * we've copied into the disk buffer is written to disk.  If we did we
3266          * might overwrite the copy of the inode in the log with all the data
3267          * after re-logging only part of it, and in the face of a crash we
3268          * wouldn't have all the data we need to recover.
3269          *
3270          * What we do is move the bits to the ili_last_fields field.  When
3271          * logging the inode, these bits are moved back to the ili_fields field.
3272          * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3273          * know that the information those bits represent is permanently on
3274          * disk.  As long as the flush completes before the inode is logged
3275          * again, then both ili_fields and ili_last_fields will be cleared.
3276          *
3277          * We can play with the ili_fields bits here, because the inode lock
3278          * must be held exclusively in order to set bits there and the flush
3279          * lock protects the ili_last_fields bits.  Set ili_logged so the flush
3280          * done routine can tell whether or not to look in the AIL.  Also, store
3281          * the current LSN of the inode so that we can tell whether the item has
3282          * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
3283          * need the AIL lock, because it is a 64 bit value that cannot be read
3284          * atomically.
3285          */
3286         iip->ili_last_fields = iip->ili_fields;
3287         iip->ili_fields = 0;
3288         iip->ili_logged = 1;
3289
3290         xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3291                                 &iip->ili_item.li_lsn);
3292
3293         /*
3294          * Attach the function xfs_iflush_done to the inode's
3295          * buffer.  This will remove the inode from the AIL
3296          * and unlock the inode's flush lock when the inode is
3297          * completely written to disk.
3298          */
3299         xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3300
3301         /* update the lsn in the on disk inode if required */
             /*
              * Only version 3 inodes get the LSN stamped into the on-disk image.
              * NOTE(review): li_lsn is read directly here, whereas the comment
              * above says reading the lsn needs the AIL lock -- confirm this
              * is safe.
              */
3302         if (ip->i_d.di_version == 3)
3303                 dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
3304
3305         /* generate the checksum. */
3306         xfs_dinode_calc_crc(mp, dip);
3307
             /* Presumably both were set by xfs_buf_attach_iodone() above -- confirm. */
3308         ASSERT(bp->b_fspriv != NULL);
3309         ASSERT(bp->b_iodone != NULL);
3310         return 0;
3311
             /*
              * Reached only from the sanity checks, i.e. before anything was
              * modified; the buffer and flush lock are left for the caller.
              */
3312 corrupt_out:
3313         return XFS_ERROR(EFSCORRUPTED);
3314 }