Merge tag 'xfs-4.20-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 24 Oct 2018 16:36:12 +0000 (17:36 +0100)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 24 Oct 2018 16:36:12 +0000 (17:36 +0100)
Pull xfs updates from Dave Chinner:
 "There's not a huge amount of change in this cycle - Darrick has been
  out of action for a couple of months (hence me sending the last few
  pull requests), so we decided a quiet cycle mainly focussed on bug
  fixes was a good idea. Darrick will take the helm again at the end of
  this merge window.

  FYI, I may be sending another update later in the cycle - there's a
  pending rework of the clone/dedupe_file_range code that fixes
  numerous bugs spread amongst the VFS, XFS and ocfs2 code. It has
  been reviewed and tested; Al and I just need to work out the details
  of the merge, so it may come from him rather than me.

  Summary:

   - only support filesystems with unwritten extents

   - add definition for statfs XFS magic number (see the userspace
     sketch after this summary)

   - remove unused parameters around reflink code

   - more debug for dangling delalloc extents

   - cancel COW extents on extent swap targets

   - fix quota stats output and clean up the code

   - refactor some of the attribute code in preparation for parent
     pointers

   - fix several buffer handling bugs"
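
A quick userspace illustration of the statfs magic number item above: with the
define exported through include/uapi/linux/magic.h by this series, programs no
longer need to hardcode the "XFSB" value to detect XFS via statfs(2). This is a
minimal sketch, assuming a kernel headers package that already carries the new
define (XFS_SUPER_MAGIC, value 0x58465342):

	#include <stdio.h>
	#include <sys/vfs.h>
	#include <linux/magic.h>	/* XFS_SUPER_MAGIC, after this series */

	/* Return 1 if @path is on XFS, 0 if not, -1 on statfs() failure. */
	static int is_xfs(const char *path)
	{
		struct statfs	sfs;

		if (statfs(path, &sfs) < 0)
			return -1;
		return sfs.f_type == XFS_SUPER_MAGIC;
	}

	int main(void)
	{
		printf("/ is on XFS: %d\n", is_xfs("/"));
		return 0;
	}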

* tag 'xfs-4.20-merge-1' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (21 commits)
  xfs: cancel COW blocks before swapext
  xfs: clear ail delwri queued bufs on unmount of shutdown fs
  xfs: use offsetof() in place of offset macros for __xfsstats
  xfs: Fix xqmstats offsets in /proc/fs/xfs/xqmstat
  xfs: fix use-after-free race in xfs_buf_rele
  xfs: Add attribute remove and helper functions
  xfs: Add attribute set and helper functions
  xfs: Add helper function xfs_attr_try_sf_addname
  xfs: Move fs/xfs/xfs_attr.h to fs/xfs/libxfs/xfs_attr.h
  xfs: issue log message on user force shutdown
  xfs: fix buffer state management in xrep_findroot_block
  xfs: always assign buffer verifiers when one is provided
  xfs: xrep_findroot_block should reject root blocks with siblings
  xfs: add a define for statfs magic to uapi
  xfs: print dangling delalloc extents
  xfs: fix fork selection in xfs_find_trim_cow_extent
  xfs: remove the unused trimmed argument from xfs_reflink_trim_around_shared
  xfs: remove the unused shared argument to xfs_reflink_reserve_cow
  xfs: handle zeroing in xfs_file_iomap_begin_delay
  xfs: remove support for filesystems without unwritten extent flag
  ...
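
The "use offsetof() in place of offset macros for __xfsstats" and "Fix
xqmstats offsets" entries above address the same class of problem:
hand-maintained offset constants silently go stale when fields are added to a
stats structure. A generic sketch of the technique, with illustrative struct
and field names only (not the real __xfsstats layout):

	#include <stddef.h>	/* offsetof() */
	#include <stdint.h>

	struct demo_stats {
		uint32_t	xs_allocx;
		uint32_t	xs_attr[4];
		uint32_t	xs_qm[8];
	};

	/*
	 * Derive the per-section offsets (counted in uint32_t slots, not
	 * bytes) from the layout itself so they can never drift from the
	 * structure definition.
	 */
	enum {
		DEMO_ATTR_OFF	= offsetof(struct demo_stats, xs_attr) / sizeof(uint32_t),
		DEMO_QM_OFF	= offsetof(struct demo_stats, xs_qm) / sizeof(uint32_t),
	};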

26 files changed:
fs/xfs/libxfs/xfs_attr.c
fs/xfs/libxfs/xfs_attr.h [new file with mode: 0644]
fs/xfs/libxfs/xfs_bmap.c
fs/xfs/libxfs/xfs_bmap.h
fs/xfs/libxfs/xfs_format.h
fs/xfs/libxfs/xfs_sb.c
fs/xfs/scrub/repair.c
fs/xfs/scrub/scrub.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_aops.h
fs/xfs/xfs_attr.h [deleted file]
fs/xfs/xfs_bmap_util.c
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_fsops.c
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_reflink.c
fs/xfs/xfs_reflink.h
fs/xfs/xfs_stats.c
fs/xfs/xfs_stats.h
fs/xfs/xfs_super.c
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_ail.c
fs/xfs/xfs_trans_buf.c
include/uapi/linux/magic.h

index c6299f82a6e496ac00b1ef27953a5ca5313cb9f8..844ed87b190077115c760204659179bca1da8c43 100644 (file)
@@ -191,6 +191,128 @@ xfs_attr_calc_size(
        return nblks;
 }
 
+STATIC int
+xfs_attr_try_sf_addname(
+       struct xfs_inode        *dp,
+       struct xfs_da_args      *args)
+{
+
+       struct xfs_mount        *mp = dp->i_mount;
+       int                     error, error2;
+
+       error = xfs_attr_shortform_addname(args);
+       if (error == -ENOSPC)
+               return error;
+
+       /*
+        * Commit the shortform mods, and we're done.
+        * NOTE: this is also the error path (EEXIST, etc).
+        */
+       if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+               xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(args->trans);
+
+       error2 = xfs_trans_commit(args->trans);
+       args->trans = NULL;
+       return error ? error : error2;
+}
+
+/*
+ * Set the attribute specified in @args.
+ */
+int
+xfs_attr_set_args(
+       struct xfs_da_args      *args,
+       struct xfs_buf          **leaf_bp)
+{
+       struct xfs_inode        *dp = args->dp;
+       int                     error;
+
+       /*
+        * If the attribute list is non-existent or a shortform list,
+        * upgrade it to a single-leaf-block attribute list.
+        */
+       if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+           (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+            dp->i_d.di_anextents == 0)) {
+
+               /*
+                * Build initial attribute list (if required).
+                */
+               if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+                       xfs_attr_shortform_create(args);
+
+               /*
+                * Try to add the attr to the attribute list in the inode.
+                */
+               error = xfs_attr_try_sf_addname(dp, args);
+               if (error != -ENOSPC)
+                       return error;
+
+               /*
+                * It won't fit in the shortform, transform to a leaf block.
+                * GROT: another possible req'mt for a double-split btree op.
+                */
+               error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+               if (error)
+                       return error;
+
+               /*
+                * Prevent the leaf buffer from being unlocked so that a
+                * concurrent AIL push cannot grab the half-baked leaf
+                * buffer and run into problems with the write verifier.
+                */
+               xfs_trans_bhold(args->trans, *leaf_bp);
+
+               error = xfs_defer_finish(&args->trans);
+               if (error)
+                       return error;
+
+               /*
+                * Commit the leaf transformation.  We'll need another
+                * (linked) transaction to add the new attribute to the
+                * leaf.
+                */
+               error = xfs_trans_roll_inode(&args->trans, dp);
+               if (error)
+                       return error;
+               xfs_trans_bjoin(args->trans, *leaf_bp);
+               *leaf_bp = NULL;
+       }
+
+       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+               error = xfs_attr_leaf_addname(args);
+       else
+               error = xfs_attr_node_addname(args);
+       return error;
+}
+
+/*
+ * Remove the attribute specified in @args.
+ */
+int
+xfs_attr_remove_args(
+       struct xfs_da_args      *args)
+{
+       struct xfs_inode        *dp = args->dp;
+       int                     error;
+
+       if (!xfs_inode_hasattr(dp)) {
+               error = -ENOATTR;
+       } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+               ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+               error = xfs_attr_shortform_remove(args);
+       } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+               error = xfs_attr_leaf_removename(args);
+       } else {
+               error = xfs_attr_node_removename(args);
+       }
+
+       return error;
+}
+
 int
 xfs_attr_set(
        struct xfs_inode        *dp,
@@ -204,7 +326,7 @@ xfs_attr_set(
        struct xfs_da_args      args;
        struct xfs_trans_res    tres;
        int                     rsvd = (flags & ATTR_ROOT) != 0;
-       int                     error, err2, local;
+       int                     error, local;
 
        XFS_STATS_INC(mp, xs_attr_set);
 
@@ -255,93 +377,17 @@ xfs_attr_set(
        error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
                                rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
                                       XFS_QMOPT_RES_REGBLKS);
-       if (error) {
-               xfs_iunlock(dp, XFS_ILOCK_EXCL);
-               xfs_trans_cancel(args.trans);
-               return error;
-       }
+       if (error)
+               goto out_trans_cancel;
 
        xfs_trans_ijoin(args.trans, dp, 0);
-
-       /*
-        * If the attribute list is non-existent or a shortform list,
-        * upgrade it to a single-leaf-block attribute list.
-        */
-       if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
-           (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
-            dp->i_d.di_anextents == 0)) {
-
-               /*
-                * Build initial attribute list (if required).
-                */
-               if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
-                       xfs_attr_shortform_create(&args);
-
-               /*
-                * Try to add the attr to the attribute list in
-                * the inode.
-                */
-               error = xfs_attr_shortform_addname(&args);
-               if (error != -ENOSPC) {
-                       /*
-                        * Commit the shortform mods, and we're done.
-                        * NOTE: this is also the error path (EEXIST, etc).
-                        */
-                       ASSERT(args.trans != NULL);
-
-                       /*
-                        * If this is a synchronous mount, make sure that
-                        * the transaction goes to disk before returning
-                        * to the user.
-                        */
-                       if (mp->m_flags & XFS_MOUNT_WSYNC)
-                               xfs_trans_set_sync(args.trans);
-
-                       if (!error && (flags & ATTR_KERNOTIME) == 0) {
-                               xfs_trans_ichgtime(args.trans, dp,
-                                                       XFS_ICHGTIME_CHG);
-                       }
-                       err2 = xfs_trans_commit(args.trans);
-                       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
-                       return error ? error : err2;
-               }
-
-               /*
-                * It won't fit in the shortform, transform to a leaf block.
-                * GROT: another possible req'mt for a double-split btree op.
-                */
-               error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
-               if (error)
-                       goto out;
-               /*
-                * Prevent the leaf buffer from being unlocked so that a
-                * concurrent AIL push cannot grab the half-baked leaf
-                * buffer and run into problems with the write verifier.
-                */
-               xfs_trans_bhold(args.trans, leaf_bp);
-               error = xfs_defer_finish(&args.trans);
-               if (error)
-                       goto out;
-
-               /*
-                * Commit the leaf transformation.  We'll need another (linked)
-                * transaction to add the new attribute to the leaf, which
-                * means that we have to hold & join the leaf buffer here too.
-                */
-               error = xfs_trans_roll_inode(&args.trans, dp);
-               if (error)
-                       goto out;
-               xfs_trans_bjoin(args.trans, leaf_bp);
-               leaf_bp = NULL;
-       }
-
-       if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
-               error = xfs_attr_leaf_addname(&args);
-       else
-               error = xfs_attr_node_addname(&args);
+       error = xfs_attr_set_args(&args, &leaf_bp);
        if (error)
-               goto out;
+               goto out_release_leaf;
+       if (!args.trans) {
+               /* shortform attribute has already been committed */
+               goto out_unlock;
+       }
 
        /*
         * If this is a synchronous mount, make sure that the
@@ -358,17 +404,17 @@ xfs_attr_set(
         */
        xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
        error = xfs_trans_commit(args.trans);
+out_unlock:
        xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
        return error;
 
-out:
+out_release_leaf:
        if (leaf_bp)
                xfs_trans_brelse(args.trans, leaf_bp);
+out_trans_cancel:
        if (args.trans)
                xfs_trans_cancel(args.trans);
-       xfs_iunlock(dp, XFS_ILOCK_EXCL);
-       return error;
+       goto out_unlock;
 }
 
 /*
@@ -423,17 +469,7 @@ xfs_attr_remove(
         */
        xfs_trans_ijoin(args.trans, dp, 0);
 
-       if (!xfs_inode_hasattr(dp)) {
-               error = -ENOATTR;
-       } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
-               ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
-               error = xfs_attr_shortform_remove(&args);
-       } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
-               error = xfs_attr_leaf_removename(&args);
-       } else {
-               error = xfs_attr_node_removename(&args);
-       }
-
+       error = xfs_attr_remove_args(&args);
        if (error)
                goto out;
 
diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
new file mode 100644 (file)
index 0000000..bdf52a3
--- /dev/null
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_ATTR_H__
+#define        __XFS_ATTR_H__
+
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
+/*
+ * Large attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes.  Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree.  Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys.
+ * The internal links in the Btree are logical block offsets into the file.
+ *
+ * Small attribute lists use a different format and are packed as tightly
+ * as possible so as to fit into the literal area of the inode.
+ */
+
+/*========================================================================
+ * External interfaces
+ *========================================================================*/
+
+
+#define ATTR_DONTFOLLOW        0x0001  /* -- unused, from IRIX -- */
+#define ATTR_ROOT      0x0002  /* use attrs in root (trusted) namespace */
+#define ATTR_TRUST     0x0004  /* -- unused, from IRIX -- */
+#define ATTR_SECURE    0x0008  /* use attrs in security namespace */
+#define ATTR_CREATE    0x0010  /* pure create: fail if attr already exists */
+#define ATTR_REPLACE   0x0020  /* pure set: fail if attr does not exist */
+
+#define ATTR_KERNOTIME 0x1000  /* [kernel] don't update inode timestamps */
+#define ATTR_KERNOVAL  0x2000  /* [kernel] get attr size only, not value */
+
+#define ATTR_INCOMPLETE        0x4000  /* [kernel] return INCOMPLETE attr keys */
+
+#define XFS_ATTR_FLAGS \
+       { ATTR_DONTFOLLOW,      "DONTFOLLOW" }, \
+       { ATTR_ROOT,            "ROOT" }, \
+       { ATTR_TRUST,           "TRUST" }, \
+       { ATTR_SECURE,          "SECURE" }, \
+       { ATTR_CREATE,          "CREATE" }, \
+       { ATTR_REPLACE,         "REPLACE" }, \
+       { ATTR_KERNOTIME,       "KERNOTIME" }, \
+       { ATTR_KERNOVAL,        "KERNOVAL" }, \
+       { ATTR_INCOMPLETE,      "INCOMPLETE" }
+
+/*
+ * The maximum size (into the kernel or returned from the kernel) of an
+ * attribute value or the buffer used for an attr_list() call.  Larger
+ * sizes will result in an ERANGE return code.
+ */
+#define        ATTR_MAX_VALUELEN       (64*1024)       /* max length of a value */
+
+/*
+ * Define how lists of attribute names are returned to the user from
+ * the attr_list() call.  A large, 32bit aligned, buffer is passed in
+ * along with its size.  We put an array of offsets at the top that each
+ * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
+ */
+typedef struct attrlist {
+       __s32   al_count;       /* number of entries in attrlist */
+       __s32   al_more;        /* T/F: more attrs (do call again) */
+       __s32   al_offset[1];   /* byte offsets of attrs [var-sized] */
+} attrlist_t;
+
+/*
+ * Show the interesting info about one attribute.  This is what the
+ * al_offset[i] entry points to.
+ */
+typedef struct attrlist_ent {  /* data from attr_list() */
+       __u32   a_valuelen;     /* number bytes in value of attr */
+       char    a_name[1];      /* attr name (NULL terminated) */
+} attrlist_ent_t;
+
+/*
+ * Given a pointer to the (char*) buffer containing the attr_list() result,
+ * and an index, return a pointer to the indicated attribute in the buffer.
+ */
+#define        ATTR_ENTRY(buffer, index)               \
+       ((attrlist_ent_t *)                     \
+        &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
+
+/*
+ * Kernel-internal version of the attrlist cursor.
+ */
+typedef struct attrlist_cursor_kern {
+       __u32   hashval;        /* hash value of next entry to add */
+       __u32   blkno;          /* block containing entry (suggestion) */
+       __u32   offset;         /* offset in list of equal-hashvals */
+       __u16   pad1;           /* padding to match user-level */
+       __u8    pad2;           /* padding to match user-level */
+       __u8    initted;        /* T/F: cursor has been initialized */
+} attrlist_cursor_kern_t;
+
+
+/*========================================================================
+ * Structure used to pass context around among the routines.
+ *========================================================================*/
+
+
+/* void; state communicated via *context */
+typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+                             unsigned char *, int, int);
+
+typedef struct xfs_attr_list_context {
+       struct xfs_trans                *tp;
+       struct xfs_inode                *dp;            /* inode */
+       struct attrlist_cursor_kern     *cursor;        /* position in list */
+       char                            *alist;         /* output buffer */
+       int                             seen_enough;    /* T/F: seen enough of list? */
+       ssize_t                         count;          /* num used entries */
+       int                             dupcnt;         /* count dup hashvals seen */
+       int                             bufsize;        /* total buffer size */
+       int                             firstu;         /* first used byte in buffer */
+       int                             flags;          /* from VOP call */
+       int                             resynch;        /* T/F: resynch with cursor */
+       put_listent_func_t              put_listent;    /* list output fmt function */
+       int                             index;          /* index into output buffer */
+} xfs_attr_list_context_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
+
+/*
+ * Overall external interface routines.
+ */
+int xfs_attr_inactive(struct xfs_inode *dp);
+int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *);
+int xfs_attr_list_int(struct xfs_attr_list_context *);
+int xfs_inode_hasattr(struct xfs_inode *ip);
+int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
+int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
+                unsigned char *value, int *valuelenp, int flags);
+int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
+                unsigned char *value, int valuelen, int flags);
+int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp);
+int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_remove_args(struct xfs_da_args *args);
+int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
+                 int flags, struct attrlist_cursor_kern *cursor);
+
+
+#endif /* __XFS_ATTR_H__ */
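
The comments in this new header describe the attr_list() buffer layout: an
attrlist header, an array of offsets, and packed attrlist_ent entries reached
through ATTR_ENTRY(). Below is a minimal sketch of walking a buffer that has
already been filled, assuming the userspace attr/attributes.h header from
libattr, which mirrors these attrlist definitions; the fill step itself is
outside the scope of this header:

	#include <stdio.h>
	#include <attr/attributes.h>	/* attrlist_t, attrlist_ent_t, ATTR_ENTRY */

	/* Print every attribute name and value length in one result buffer. */
	static void print_attr_names(char *buffer)
	{
		attrlist_t	*al = (attrlist_t *)buffer;
		int		i;

		for (i = 0; i < al->al_count; i++) {
			attrlist_ent_t	*ent = ATTR_ENTRY(buffer, i);

			printf("%s (%u bytes)\n", ent->a_name,
					(unsigned int)ent->a_valuelen);
		}
		if (al->al_more)
			printf("more attrs remain; call attr_list() again\n");
	}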
index a47670332326449cb97f73850887cea799ea684c..74d7228e755b3ade097457c0ae68e07e252e67a4 100644 (file)
@@ -1019,6 +1019,34 @@ xfs_bmap_add_attrfork_local(
        return -EFSCORRUPTED;
 }
 
+/* Set an inode attr fork off based on the format */
+int
+xfs_bmap_set_attrforkoff(
+       struct xfs_inode        *ip,
+       int                     size,
+       int                     *version)
+{
+       switch (ip->i_d.di_format) {
+       case XFS_DINODE_FMT_DEV:
+               ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+               break;
+       case XFS_DINODE_FMT_LOCAL:
+       case XFS_DINODE_FMT_EXTENTS:
+       case XFS_DINODE_FMT_BTREE:
+               ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+               if (!ip->i_d.di_forkoff)
+                       ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+               else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version)
+                       *version = 2;
+               break;
+       default:
+               ASSERT(0);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
 /*
  * Convert inode from non-attributed to attributed.
  * Must not be in a transaction, ip must not be locked.
@@ -1070,26 +1098,9 @@ xfs_bmap_add_attrfork(
 
        xfs_trans_ijoin(tp, ip, 0);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       switch (ip->i_d.di_format) {
-       case XFS_DINODE_FMT_DEV:
-               ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
-               break;
-       case XFS_DINODE_FMT_LOCAL:
-       case XFS_DINODE_FMT_EXTENTS:
-       case XFS_DINODE_FMT_BTREE:
-               ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
-               if (!ip->i_d.di_forkoff)
-                       ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
-               else if (mp->m_flags & XFS_MOUNT_ATTR2)
-                       version = 2;
-               break;
-       default:
-               ASSERT(0);
-               error = -EINVAL;
+       error = xfs_bmap_set_attrforkoff(ip, size, &version);
+       if (error)
                goto trans_cancel;
-       }
-
        ASSERT(ip->i_afp == NULL);
        ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
        ip->i_afp->if_flags = XFS_IFEXTENTS;
@@ -4081,8 +4092,7 @@ xfs_bmapi_allocate(
         * extents to real extents when we're about to write the data.
         */
        if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
-           (bma->flags & XFS_BMAPI_PREALLOC) &&
-           xfs_sb_version_hasextflgbit(&mp->m_sb))
+           (bma->flags & XFS_BMAPI_PREALLOC))
                bma->got.br_state = XFS_EXT_UNWRITTEN;
 
        if (bma->wasdel)
@@ -5245,8 +5255,7 @@ __xfs_bunmapi(
                         * unmapping part of it.  But we can't really
                         * get rid of part of a realtime extent.
                         */
-                       if (del.br_state == XFS_EXT_UNWRITTEN ||
-                           !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+                       if (del.br_state == XFS_EXT_UNWRITTEN) {
                                /*
                                 * This piece is unwritten, or we're not
                                 * using unwritten extents.  Skip over it.
@@ -5296,10 +5305,9 @@ __xfs_bunmapi(
                                del.br_blockcount -= mod;
                                del.br_startoff += mod;
                                del.br_startblock += mod;
-                       } else if ((del.br_startoff == start &&
-                                   (del.br_state == XFS_EXT_UNWRITTEN ||
-                                    tp->t_blk_res == 0)) ||
-                                  !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+                       } else if (del.br_startoff == start &&
+                                  (del.br_state == XFS_EXT_UNWRITTEN ||
+                                   tp->t_blk_res == 0)) {
                                /*
                                 * Can't make it unwritten.  There isn't
                                 * a full extent here so just skip it.
@@ -6114,11 +6122,7 @@ xfs_bmap_validate_extent(
                    XFS_FSB_TO_AGNO(mp, endfsb))
                        return __this_address;
        }
-       if (irec->br_state != XFS_EXT_NORM) {
-               if (whichfork != XFS_DATA_FORK)
-                       return __this_address;
-               if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
-                       return __this_address;
-       }
+       if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
+               return __this_address;
        return NULL;
 }
index b6e9b639e731a1fafd1116b9decb22872dc765af..488dc8860fd7c551b02e21eeae3462b96e268eb8 100644 (file)
@@ -183,6 +183,7 @@ void        xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
                xfs_filblks_t len);
 void   xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
 int    xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+int    xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
 void   xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
 void   __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
                xfs_filblks_t len, struct xfs_owner_info *oinfo,
index afbe336600e165e2fe475ebcf6683e3f677c1cc4..9995d5ae380b9cb0df10274a7824f12b069ddb95 100644 (file)
@@ -287,6 +287,8 @@ static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
 {
        if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
                return false;
+       if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+               return false;
 
        /* check for unknown features in the fs */
        if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
@@ -357,12 +359,6 @@ static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
               (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
 }
 
-static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
-{
-       return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
-              (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
-}
-
 static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
 {
        return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
index 081f46e30556637326ef480ccb976b8cb8d0926c..b5a82acd7dfe01d9225c345bbd15740fb4995e83 100644 (file)
@@ -1115,7 +1115,8 @@ xfs_fs_geometry(
 
        geo->version = XFS_FSOP_GEOM_VERSION;
        geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
-                    XFS_FSOP_GEOM_FLAGS_DIRV2;
+                    XFS_FSOP_GEOM_FLAGS_DIRV2 |
+                    XFS_FSOP_GEOM_FLAGS_EXTFLG;
        if (xfs_sb_version_hasattr(sbp))
                geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
        if (xfs_sb_version_hasquota(sbp))
@@ -1124,8 +1125,6 @@ xfs_fs_geometry(
                geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
        if (xfs_sb_version_hasdalign(sbp))
                geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
-       if (xfs_sb_version_hasextflgbit(sbp))
-               geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG;
        if (xfs_sb_version_hassector(sbp))
                geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
        if (xfs_sb_version_hasasciici(sbp))
index 9f08dd9bf1d553934bf2aa2f8a108ab8435bc19c..4fc0a5ea76733df273a4c3dcd4e184a40f9768f5 100644 (file)
@@ -29,6 +29,8 @@
 #include "xfs_ag_resv.h"
 #include "xfs_trans_space.h"
 #include "xfs_quota.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -692,13 +694,14 @@ xrep_findroot_block(
        struct xrep_find_ag_btree       *fab,
        uint64_t                        owner,
        xfs_agblock_t                   agbno,
-       bool                            *found_it)
+       bool                            *done_with_block)
 {
        struct xfs_mount                *mp = ri->sc->mp;
        struct xfs_buf                  *bp;
        struct xfs_btree_block          *btblock;
        xfs_daddr_t                     daddr;
-       int                             error;
+       int                             block_level;
+       int                             error = 0;
 
        daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
 
@@ -717,36 +720,111 @@ xrep_findroot_block(
                        return error;
        }
 
+       /*
+        * Read the buffer into memory so that we can see if it's a match for
+        * our btree type.  We have no clue if it is beforehand, and we want to
+        * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
+        * will cause needless disk reads in subsequent calls to this function)
+        * and logging metadata verifier failures.
+        *
+        * Therefore, pass in NULL buffer ops.  If the buffer was already in
+        * memory from some other caller it will already have b_ops assigned.
+        * If it was in memory from a previous unsuccessful findroot_block
+        * call, the buffer won't have b_ops but it should be clean and ready
+        * for us to try to verify if the read call succeeds.  The same applies
+        * if the buffer wasn't in memory at all.
+        *
+        * Note: If we never match a btree type with this buffer, it will be
+        * left in memory with NULL b_ops.  This shouldn't be a problem unless
+        * the buffer gets written.
+        */
        error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
                        mp->m_bsize, 0, &bp, NULL);
        if (error)
                return error;
 
-       /*
-        * Does this look like a block matching our fs and higher than any
-        * other block we've found so far?  If so, reattach buffer verifiers
-        * so the AIL won't complain if the buffer is also dirty.
-        */
+       /* Ensure the block magic matches the btree type we're looking for. */
        btblock = XFS_BUF_TO_BLOCK(bp);
        if (be32_to_cpu(btblock->bb_magic) != fab->magic)
                goto out;
-       if (xfs_sb_version_hascrc(&mp->m_sb) &&
-           !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
-               goto out;
-       bp->b_ops = fab->buf_ops;
 
-       /* Ignore this block if it's lower in the tree than we've seen. */
-       if (fab->root != NULLAGBLOCK &&
-           xfs_btree_get_level(btblock) < fab->height)
-               goto out;
+       /*
+        * If the buffer already has ops applied and they're not the ones for
+        * this btree type, we know this block doesn't match the btree and we
+        * can bail out.
+        *
+        * If the buffer ops match ours, someone else has already validated
+        * the block for us, so we can move on to checking if this is a root
+        * block candidate.
+        *
+        * If the buffer does not have ops, nobody has successfully validated
+        * the contents and the buffer cannot be dirty.  If the magic, uuid,
+        * and structure match this btree type then we'll move on to checking
+        * if it's a root block candidate.  If there is no match, bail out.
+        */
+       if (bp->b_ops) {
+               if (bp->b_ops != fab->buf_ops)
+                       goto out;
+       } else {
+               ASSERT(!xfs_trans_buf_is_dirty(bp));
+               if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
+                               &mp->m_sb.sb_meta_uuid))
+                       goto out;
+               fab->buf_ops->verify_read(bp);
+               if (bp->b_error) {
+                       bp->b_error = 0;
+                       goto out;
+               }
 
-       /* Make sure we pass the verifiers. */
-       bp->b_ops->verify_read(bp);
-       if (bp->b_error)
+               /*
+                * Some read verifiers will (re)set b_ops, so we must be
+                * careful not to blow away any such assignment.
+                */
+               if (!bp->b_ops)
+                       bp->b_ops = fab->buf_ops;
+       }
+
+       /*
+        * This block passes the magic/uuid and verifier tests for this btree
+        * type.  We don't need the caller to try the other tree types.
+        */
+       *done_with_block = true;
+
+       /*
+        * Compare this btree block's level to the height of the current
+        * candidate root block.
+        *
+        * If the level matches the root we found previously, throw away both
+        * blocks because there can't be two candidate roots.
+        *
+        * If level is lower in the tree than the root we found previously,
+        * ignore this block.
+        */
+       block_level = xfs_btree_get_level(btblock);
+       if (block_level + 1 == fab->height) {
+               fab->root = NULLAGBLOCK;
                goto out;
-       fab->root = agbno;
-       fab->height = xfs_btree_get_level(btblock) + 1;
-       *found_it = true;
+       } else if (block_level < fab->height) {
+               goto out;
+       }
+
+       /*
+        * This is the highest block in the tree that we've found so far.
+        * Update the btree height to reflect what we've learned from this
+        * block.
+        */
+       fab->height = block_level + 1;
+
+       /*
+        * If this block doesn't have sibling pointers, then it's the new root
+        * block candidate.  Otherwise, the root will be found farther up the
+        * tree.
+        */
+       if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
+           btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+               fab->root = agbno;
+       else
+               fab->root = NULLAGBLOCK;
 
        trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
                        be32_to_cpu(btblock->bb_magic), fab->height - 1);
@@ -768,7 +846,7 @@ xrep_findroot_rmap(
        struct xrep_findroot            *ri = priv;
        struct xrep_find_ag_btree       *fab;
        xfs_agblock_t                   b;
-       bool                            found_it;
+       bool                            done;
        int                             error = 0;
 
        /* Ignore anything that isn't AG metadata. */
@@ -777,16 +855,16 @@ xrep_findroot_rmap(
 
        /* Otherwise scan each block + btree type. */
        for (b = 0; b < rec->rm_blockcount; b++) {
-               found_it = false;
+               done = false;
                for (fab = ri->btree_info; fab->buf_ops; fab++) {
                        if (rec->rm_owner != fab->rmap_owner)
                                continue;
                        error = xrep_findroot_block(ri, fab,
                                        rec->rm_owner, rec->rm_startblock + b,
-                                       &found_it);
+                                       &done);
                        if (error)
                                return error;
-                       if (found_it)
+                       if (done)
                                break;
                }
        }
index 4bfae1e61d30ee60b956ec7d3495fbbc8e0856b8..1b2344d0052549d588c61192fc332330f1d42350 100644 (file)
@@ -412,19 +412,6 @@ xchk_validate_inputs(
                goto out;
        }
 
-       error = -EOPNOTSUPP;
-       /*
-        * We won't scrub any filesystem that doesn't have the ability
-        * to record unwritten extents.  The option was made default in
-        * 2003, removed from mkfs in 2007, and cannot be disabled in
-        * v5, so if we find a filesystem without this flag it's either
-        * really old or totally unsupported.  Avoid it either way.
-        * We also don't support v1-v3 filesystems, which aren't
-        * mountable.
-        */
-       if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
-               goto out;
-
        /*
         * We only want to repair read-write v5+ filesystems.  Defer the check
         * for ops->repair until after our scrub confirms that we need to
index 49f5f5896a43532d76768f5df6a03a693a25e6fb..338b9d9984e04a62d790ae72638017c9ac3329d5 100644 (file)
@@ -917,7 +917,7 @@ xfs_vm_writepage(
        struct writeback_control *wbc)
 {
        struct xfs_writepage_ctx wpc = {
-               .io_type = XFS_IO_INVALID,
+               .io_type = XFS_IO_HOLE,
        };
        int                     ret;
 
@@ -933,7 +933,7 @@ xfs_vm_writepages(
        struct writeback_control *wbc)
 {
        struct xfs_writepage_ctx wpc = {
-               .io_type = XFS_IO_INVALID,
+               .io_type = XFS_IO_HOLE,
        };
        int                     ret;
 
index 9af867951a1077a2fdab44badb8bd743ef75b2b7..494b4338446ef2ccc519b5cc0bc1a13eeb6dbd4d 100644 (file)
@@ -12,21 +12,19 @@ extern struct bio_set xfs_ioend_bioset;
  * Types of I/O for bmap clustering and I/O completion tracking.
  */
 enum {
-       XFS_IO_INVALID,         /* initial state */
+       XFS_IO_HOLE,            /* covers region without any block allocation */
        XFS_IO_DELALLOC,        /* covers delalloc region */
        XFS_IO_UNWRITTEN,       /* covers allocated but uninitialized data */
        XFS_IO_OVERWRITE,       /* covers already allocated extent */
        XFS_IO_COW,             /* covers copy-on-write extent */
-       XFS_IO_HOLE,            /* covers region without any block allocation */
 };
 
 #define XFS_IO_TYPES \
-       { XFS_IO_INVALID,               "invalid" }, \
-       { XFS_IO_DELALLOC,              "delalloc" }, \
-       { XFS_IO_UNWRITTEN,             "unwritten" }, \
-       { XFS_IO_OVERWRITE,             "overwrite" }, \
-       { XFS_IO_COW,                   "CoW" }, \
-       { XFS_IO_HOLE,                  "hole" }
+       { XFS_IO_HOLE,                  "hole" },       \
+       { XFS_IO_DELALLOC,              "delalloc" },   \
+       { XFS_IO_UNWRITTEN,             "unwritten" },  \
+       { XFS_IO_OVERWRITE,             "overwrite" },  \
+       { XFS_IO_COW,                   "CoW" }
 
 /*
  * Structure for buffered I/O completions.
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
deleted file mode 100644 (file)
index 033ff8c..0000000
+++ /dev/null
@@ -1,148 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_ATTR_H__
-#define        __XFS_ATTR_H__
-
-struct xfs_inode;
-struct xfs_da_args;
-struct xfs_attr_list_context;
-
-/*
- * Large attribute lists are structured around Btrees where all the data
- * elements are in the leaf nodes.  Attribute names are hashed into an int,
- * then that int is used as the index into the Btree.  Since the hashval
- * of an attribute name may not be unique, we may have duplicate keys.
- * The internal links in the Btree are logical block offsets into the file.
- *
- * Small attribute lists use a different format and are packed as tightly
- * as possible so as to fit into the literal area of the inode.
- */
-
-/*========================================================================
- * External interfaces
- *========================================================================*/
-
-
-#define ATTR_DONTFOLLOW        0x0001  /* -- unused, from IRIX -- */
-#define ATTR_ROOT      0x0002  /* use attrs in root (trusted) namespace */
-#define ATTR_TRUST     0x0004  /* -- unused, from IRIX -- */
-#define ATTR_SECURE    0x0008  /* use attrs in security namespace */
-#define ATTR_CREATE    0x0010  /* pure create: fail if attr already exists */
-#define ATTR_REPLACE   0x0020  /* pure set: fail if attr does not exist */
-
-#define ATTR_KERNOTIME 0x1000  /* [kernel] don't update inode timestamps */
-#define ATTR_KERNOVAL  0x2000  /* [kernel] get attr size only, not value */
-
-#define ATTR_INCOMPLETE        0x4000  /* [kernel] return INCOMPLETE attr keys */
-
-#define XFS_ATTR_FLAGS \
-       { ATTR_DONTFOLLOW,      "DONTFOLLOW" }, \
-       { ATTR_ROOT,            "ROOT" }, \
-       { ATTR_TRUST,           "TRUST" }, \
-       { ATTR_SECURE,          "SECURE" }, \
-       { ATTR_CREATE,          "CREATE" }, \
-       { ATTR_REPLACE,         "REPLACE" }, \
-       { ATTR_KERNOTIME,       "KERNOTIME" }, \
-       { ATTR_KERNOVAL,        "KERNOVAL" }, \
-       { ATTR_INCOMPLETE,      "INCOMPLETE" }
-
-/*
- * The maximum size (into the kernel or returned from the kernel) of an
- * attribute value or the buffer used for an attr_list() call.  Larger
- * sizes will result in an ERANGE return code.
- */
-#define        ATTR_MAX_VALUELEN       (64*1024)       /* max length of a value */
-
-/*
- * Define how lists of attribute names are returned to the user from
- * the attr_list() call.  A large, 32bit aligned, buffer is passed in
- * along with its size.  We put an array of offsets at the top that each
- * reference an attrlist_ent_t and pack the attrlist_ent_t's at the bottom.
- */
-typedef struct attrlist {
-       __s32   al_count;       /* number of entries in attrlist */
-       __s32   al_more;        /* T/F: more attrs (do call again) */
-       __s32   al_offset[1];   /* byte offsets of attrs [var-sized] */
-} attrlist_t;
-
-/*
- * Show the interesting info about one attribute.  This is what the
- * al_offset[i] entry points to.
- */
-typedef struct attrlist_ent {  /* data from attr_list() */
-       __u32   a_valuelen;     /* number bytes in value of attr */
-       char    a_name[1];      /* attr name (NULL terminated) */
-} attrlist_ent_t;
-
-/*
- * Given a pointer to the (char*) buffer containing the attr_list() result,
- * and an index, return a pointer to the indicated attribute in the buffer.
- */
-#define        ATTR_ENTRY(buffer, index)               \
-       ((attrlist_ent_t *)                     \
-        &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
-
-/*
- * Kernel-internal version of the attrlist cursor.
- */
-typedef struct attrlist_cursor_kern {
-       __u32   hashval;        /* hash value of next entry to add */
-       __u32   blkno;          /* block containing entry (suggestion) */
-       __u32   offset;         /* offset in list of equal-hashvals */
-       __u16   pad1;           /* padding to match user-level */
-       __u8    pad2;           /* padding to match user-level */
-       __u8    initted;        /* T/F: cursor has been initialized */
-} attrlist_cursor_kern_t;
-
-
-/*========================================================================
- * Structure used to pass context around among the routines.
- *========================================================================*/
-
-
-/* void; state communicated via *context */
-typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int,
-                             unsigned char *, int, int);
-
-typedef struct xfs_attr_list_context {
-       struct xfs_trans                *tp;
-       struct xfs_inode                *dp;            /* inode */
-       struct attrlist_cursor_kern     *cursor;        /* position in list */
-       char                            *alist;         /* output buffer */
-       int                             seen_enough;    /* T/F: seen enough of list? */
-       ssize_t                         count;          /* num used entries */
-       int                             dupcnt;         /* count dup hashvals seen */
-       int                             bufsize;        /* total buffer size */
-       int                             firstu;         /* first used byte in buffer */
-       int                             flags;          /* from VOP call */
-       int                             resynch;        /* T/F: resynch with cursor */
-       put_listent_func_t              put_listent;    /* list output fmt function */
-       int                             index;          /* index into output buffer */
-} xfs_attr_list_context_t;
-
-
-/*========================================================================
- * Function prototypes for the kernel.
- *========================================================================*/
-
-/*
- * Overall external interface routines.
- */
-int xfs_attr_inactive(struct xfs_inode *dp);
-int xfs_attr_list_int_ilocked(struct xfs_attr_list_context *);
-int xfs_attr_list_int(struct xfs_attr_list_context *);
-int xfs_inode_hasattr(struct xfs_inode *ip);
-int xfs_attr_get_ilocked(struct xfs_inode *ip, struct xfs_da_args *args);
-int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
-                unsigned char *value, int *valuelenp, int flags);
-int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
-                unsigned char *value, int valuelen, int flags);
-int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
-int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
-                 int flags, struct attrlist_cursor_kern *cursor);
-
-
-#endif /* __XFS_ATTR_H__ */
index 6de8d90041ff0e676e85e559c1b2eef65dd74946..5d263dfdb3bcc60ca30708622397e42de9fbaeac 100644 (file)
@@ -406,10 +406,10 @@ xfs_getbmap_report_one(
        struct xfs_bmbt_irec    *got)
 {
        struct kgetbmap         *p = out + bmv->bmv_entries;
-       bool                    shared = false, trimmed = false;
+       bool                    shared = false;
        int                     error;
 
-       error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed);
+       error = xfs_reflink_trim_around_shared(ip, got, &shared);
        if (error)
                return error;
 
@@ -1042,44 +1042,6 @@ out_trans_cancel:
        goto out_unlock;
 }
 
-static int
-xfs_adjust_extent_unmap_boundaries(
-       struct xfs_inode        *ip,
-       xfs_fileoff_t           *startoffset_fsb,
-       xfs_fileoff_t           *endoffset_fsb)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_bmbt_irec    imap;
-       int                     nimap, error;
-       xfs_extlen_t            mod = 0;
-
-       nimap = 1;
-       error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
-       if (error)
-               return error;
-
-       if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-               div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod);
-               if (mod)
-                       *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
-       }
-
-       nimap = 1;
-       error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
-       if (error)
-               return error;
-
-       if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-               ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-               mod++;
-               if (mod && mod != mp->m_sb.sb_rextsize)
-                       *endoffset_fsb -= mod;
-       }
-
-       return 0;
-}
-
 static int
 xfs_flush_unmap_range(
        struct xfs_inode        *ip,
@@ -1133,19 +1095,8 @@ xfs_free_file_space(
        endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
 
        /*
-        * Need to zero the stuff we're not freeing, on disk.  If it's a RT file
-        * and we can't use unwritten extents then we actually need to ensure
-        * to zero the whole extent, otherwise we just need to take of block
-        * boundaries, and xfs_bunmapi will handle the rest.
+        * Need to zero the stuff we're not freeing, on disk.
         */
-       if (XFS_IS_REALTIME_INODE(ip) &&
-           !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
-               error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
-                               &endoffset_fsb);
-               if (error)
-                       return error;
-       }
-
        if (endoffset_fsb > startoffset_fsb) {
                while (!done) {
                        error = xfs_unmap_extent(ip, startoffset_fsb,
@@ -1824,6 +1775,12 @@ xfs_swap_extents(
        if (error)
                goto out_unlock;
 
+       if (xfs_inode_has_cow_data(tip)) {
+               error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
+               if (error)
+                       return error;
+       }
+
        /*
         * Extent "swapping" with rmap requires a permanent reservation and
         * a block reservation because it's really just a remap operation
index e839907e8492f940b431a7f28621d4148ad9a4a1..b21ea2ba768d624329d76078cfe8bab228d40cdd 100644 (file)
@@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone;
 #define xb_to_gfp(flags) \
        ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
 
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ *     b_sema (caller holds)
+ *       b_lock
+ *
+ * xfs_buf_stale:
+ *     b_sema (caller holds)
+ *       b_lock
+ *         lru_lock
+ *
+ * xfs_buf_rele:
+ *     b_lock
+ *       pag_buf_lock
+ *         lru_lock
+ *
+ * xfs_buftarg_wait_rele
+ *     lru_lock
+ *       b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate
+ *     lru_lock
+ *       b_lock (trylock due to inversion)
+ */
 
 static inline int
 xfs_buf_is_vmapped(
@@ -749,6 +775,30 @@ _xfs_buf_read(
        return xfs_buf_submit(bp);
 }
 
+/*
+ * If the caller passed in an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use them to verify the contents.  If the contents
+ * cannot be verified, we'll clear XBF_DONE.  We assume the buffer has no
+ * recorded errors and is already in XBF_DONE state.
+ */
+int
+xfs_buf_ensure_ops(
+       struct xfs_buf          *bp,
+       const struct xfs_buf_ops *ops)
+{
+       ASSERT(bp->b_flags & XBF_DONE);
+       ASSERT(bp->b_error == 0);
+
+       if (!ops || bp->b_ops)
+               return 0;
+
+       bp->b_ops = ops;
+       bp->b_ops->verify_read(bp);
+       if (bp->b_error)
+               bp->b_flags &= ~XBF_DONE;
+       return bp->b_error;
+}
+
 xfs_buf_t *
 xfs_buf_read_map(
        struct xfs_buftarg      *target,
@@ -762,26 +812,32 @@ xfs_buf_read_map(
        flags |= XBF_READ;
 
        bp = xfs_buf_get_map(target, map, nmaps, flags);
-       if (bp) {
-               trace_xfs_buf_read(bp, flags, _RET_IP_);
+       if (!bp)
+               return NULL;
 
-               if (!(bp->b_flags & XBF_DONE)) {
-                       XFS_STATS_INC(target->bt_mount, xb_get_read);
-                       bp->b_ops = ops;
-                       _xfs_buf_read(bp, flags);
-               } else if (flags & XBF_ASYNC) {
-                       /*
-                        * Read ahead call which is already satisfied,
-                        * drop the buffer
-                        */
-                       xfs_buf_relse(bp);
-                       return NULL;
-               } else {
-                       /* We do not want read in the flags */
-                       bp->b_flags &= ~XBF_READ;
-               }
+       trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+       if (!(bp->b_flags & XBF_DONE)) {
+               XFS_STATS_INC(target->bt_mount, xb_get_read);
+               bp->b_ops = ops;
+               _xfs_buf_read(bp, flags);
+               return bp;
        }
 
+       xfs_buf_ensure_ops(bp, ops);
+
+       if (flags & XBF_ASYNC) {
+               /*
+                * Read ahead call which is already satisfied,
+                * drop the buffer
+                */
+               xfs_buf_relse(bp);
+               return NULL;
+       }
+
+       /* We do not want read in the flags */
+       bp->b_flags &= ~XBF_READ;
+       ASSERT(bp->b_ops != NULL || ops == NULL);
        return bp;
 }
 
@@ -1006,8 +1062,18 @@ xfs_buf_rele(
 
        ASSERT(atomic_read(&bp->b_hold) > 0);
 
-       release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+       /*
+        * We grab the b_lock here first to serialise racing xfs_buf_rele()
+        * calls. The pag_buf_lock being taken on the last reference only
+        * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+        * to last reference we drop here is not serialised against the last
+        * reference until we take bp->b_lock. Hence if we don't grab b_lock
+        * first, the last "release" reference can win the race to the lock and
+        * free the buffer before the second-to-last reference is processed,
+        * leading to a use-after-free scenario.
+        */
        spin_lock(&bp->b_lock);
+       release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
        if (!release) {
                /*
                 * Drop the in-flight state if the buffer is already on the LRU
@@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers(
  * is only safely useable for callers that can track I/O completion by higher
  * level means, e.g. AIL pushing as the @buffer_list is consumed in this
  * function.
+ *
+ * Note: this function will skip buffers it would block on, and in doing so
+ * leaves them on @buffer_list so they can be retried on a later pass. As such,
+ * it is up to the caller to ensure that the buffer list is fully submitted or
+ * cancelled appropriately when they are finished with the list. Failure to
+ * cancel or resubmit the list until it is empty will result in leaked buffers
+ * at unmount time.
  */
 int
 xfs_buf_delwri_submit_nowait(
index 4e3171acd0f82bb3e2ad962b68119458ab99b64b..b9f5511ea998a22927f141ecf446f63e3c99f60c 100644 (file)
@@ -385,4 +385,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
 #define xfs_getsize_buftarg(buftarg)   block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)  bdev_read_only((buftarg)->bt_bdev)
 
+int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+
 #endif /* __XFS_BUF_H__ */
index 7c00b8bedfe358ae508f8e10ff25771618529a1d..093c2b8d7e20bc7a3408843ed6cf06604d87b0c1 100644 (file)
@@ -470,20 +470,13 @@ xfs_fs_goingdown(
  */
 void
 xfs_do_force_shutdown(
-       xfs_mount_t     *mp,
+       struct xfs_mount *mp,
        int             flags,
        char            *fname,
        int             lnnum)
 {
-       int             logerror;
-
-       logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+       bool            logerror = flags & SHUTDOWN_LOG_IO_ERROR;
 
-       if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-               xfs_notice(mp,
-       "%s(0x%x) called from line %d of file %s.  Return address = "PTR_FMT,
-                       __func__, flags, lnnum, fname, __return_address);
-       }
        /*
         * No need to duplicate efforts.
         */
@@ -499,27 +492,34 @@ xfs_do_force_shutdown(
        if (xfs_log_force_umount(mp, logerror))
                return;
 
+       if (flags & SHUTDOWN_FORCE_UMOUNT) {
+               xfs_alert(mp,
+"User initiated shutdown received. Shutting down filesystem");
+               return;
+       }
+
+       xfs_notice(mp,
+"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
+               __func__, flags, lnnum, fname, __return_address);
+
        if (flags & SHUTDOWN_CORRUPT_INCORE) {
                xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
-    "Corruption of in-memory data detected.  Shutting down filesystem");
+"Corruption of in-memory data detected.  Shutting down filesystem");
                if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
                        xfs_stack_trace();
-       } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-               if (logerror) {
-                       xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
-               "Log I/O Error Detected.  Shutting down filesystem");
-               } else if (flags & SHUTDOWN_DEVICE_REQ) {
-                       xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
-               "All device paths lost.  Shutting down filesystem");
-               } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
-                       xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
-               "I/O Error Detected. Shutting down filesystem");
-               }
-       }
-       if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
-               xfs_alert(mp,
-       "Please umount the filesystem and rectify the problem(s)");
+       } else if (logerror) {
+               xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+                       "Log I/O Error Detected. Shutting down filesystem");
+       } else if (flags & SHUTDOWN_DEVICE_REQ) {
+               xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+                       "All device paths lost. Shutting down filesystem");
+       } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+               xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+                       "I/O Error Detected. Shutting down filesystem");
        }
+
+       xfs_alert(mp,
+               "Please unmount the filesystem and rectify the problem(s)");
 }
 
 /*
index 0ef5ece5634c11099d04112359daadb4fdbcab03..6e2c08f30f602deb360e737003cc3ae1abf4bfc7 100644 (file)
@@ -604,14 +604,6 @@ xfs_ioc_space(
        uint                    iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
        int                     error;
 
-       /*
-        * Only allow the sys admin to reserve space unless
-        * unwritten extents are enabled.
-        */
-       if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
-           !capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
        if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
                return -EPERM;
 
index 6320aca39f39415257f3bbb9b0313dbc26284861..27c93b5f029df92b17c22456c0bfe7a5a3fa085c 100644 (file)
@@ -62,6 +62,21 @@ xfs_bmbt_to_iomap(
        iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
 }
 
+static void
+xfs_hole_to_iomap(
+       struct xfs_inode        *ip,
+       struct iomap            *iomap,
+       xfs_fileoff_t           offset_fsb,
+       xfs_fileoff_t           end_fsb)
+{
+       iomap->addr = IOMAP_NULL_ADDR;
+       iomap->type = IOMAP_HOLE;
+       iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
+       iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
+       iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+       iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+}
+
 xfs_extlen_t
 xfs_eof_alignment(
        struct xfs_inode        *ip,
@@ -502,6 +517,7 @@ xfs_file_iomap_begin_delay(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  count,
+       unsigned                flags,
        struct iomap            *iomap)
 {
        struct xfs_inode        *ip = XFS_I(inode);
@@ -538,15 +554,23 @@ xfs_file_iomap_begin_delay(
                        goto out_unlock;
        }
 
+       end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
        eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
-       if (!eof && got.br_startoff <= offset_fsb) {
-               if (xfs_is_reflink_inode(ip)) {
-                       bool            shared;
+       if (eof)
+               got.br_startoff = end_fsb; /* fake hole until the end */
 
-                       end_fsb = min(XFS_B_TO_FSB(mp, offset + count),
-                                       maxbytes_fsb);
+       if (got.br_startoff <= offset_fsb) {
+               /*
+                * For reflink files we may need a delalloc reservation when
+                * overwriting shared extents.   This includes zeroing of
+                * existing extents that contain data.
+                */
+               if (xfs_is_reflink_inode(ip) &&
+                   ((flags & IOMAP_WRITE) ||
+                    got.br_state != XFS_EXT_UNWRITTEN)) {
                        xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
-                       error = xfs_reflink_reserve_cow(ip, &got, &shared);
+                       error = xfs_reflink_reserve_cow(ip, &got);
                        if (error)
                                goto out_unlock;
                }
@@ -555,6 +579,11 @@ xfs_file_iomap_begin_delay(
                goto done;
        }
 
+       if (flags & IOMAP_ZERO) {
+               xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
+               goto out_unlock;
+       }
+
        error = xfs_qm_dqattach_locked(ip, false);
        if (error)
                goto out_unlock;
@@ -1003,16 +1032,17 @@ xfs_file_iomap_begin(
        struct xfs_bmbt_irec    imap;
        xfs_fileoff_t           offset_fsb, end_fsb;
        int                     nimaps = 1, error = 0;
-       bool                    shared = false, trimmed = false;
+       bool                    shared = false;
        unsigned                lockmode;
 
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+       if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
                        !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
                /* Reserve delalloc blocks for regular writeback. */
-               return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
+               return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+                               iomap);
        }
 
        /*
@@ -1038,8 +1068,7 @@ xfs_file_iomap_begin(
 
        if (flags & IOMAP_REPORT) {
                /* Trim the mapping to the nearest shared extent boundary. */
-               error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
-                               &trimmed);
+               error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
                if (error)
                        goto out_unlock;
        }
@@ -1065,7 +1094,7 @@ xfs_file_iomap_begin(
                        if (error)
                                goto out_unlock;
                } else {
-                       error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+                       error = xfs_reflink_reserve_cow(ip, &imap);
                        if (error)
                                goto out_unlock;
                }
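The IOMAP_ZERO handling added to xfs_file_iomap_begin_delay() reports holes directly and skips the delalloc reservation when zeroing. As a rough illustration (a sketch, not code from this series; example_zero_range is a hypothetical helper), zeroing reaches this path through the generic iomap layer:

	/*
	 * Illustrative only: iomap_zero_range() calls back into
	 * xfs_file_iomap_begin() with IOMAP_ZERO set, which now routes
	 * buffered, non-extsize inodes through xfs_file_iomap_begin_delay().
	 */
	static int example_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len)
	{
		bool	did_zero = false;

		return iomap_zero_range(VFS_I(ip), pos, len, &did_zero,
					&xfs_iomap_ops);
	}
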
index 42ea7bab9144cc026f50d802acc31c42ab7f0858..8eaeec9d58ed6799898753f49f0ad895b5db4cb5 100644 (file)
@@ -182,8 +182,7 @@ int
 xfs_reflink_trim_around_shared(
        struct xfs_inode        *ip,
        struct xfs_bmbt_irec    *irec,
-       bool                    *shared,
-       bool                    *trimmed)
+       bool                    *shared)
 {
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
@@ -209,7 +208,7 @@ xfs_reflink_trim_around_shared(
        if (error)
                return error;
 
-       *shared = *trimmed = false;
+       *shared = false;
        if (fbno == NULLAGBLOCK) {
                /* No shared blocks at all. */
                return 0;
@@ -222,8 +221,6 @@ xfs_reflink_trim_around_shared(
                 */
                irec->br_blockcount = flen;
                *shared = true;
-               if (flen != aglen)
-                       *trimmed = true;
                return 0;
        } else {
                /*
@@ -233,7 +230,6 @@ xfs_reflink_trim_around_shared(
                 * start of the shared region.
                 */
                irec->br_blockcount = fbno - agbno;
-               *trimmed = true;
                return 0;
        }
 }
@@ -241,7 +237,7 @@ xfs_reflink_trim_around_shared(
 /*
  * Trim the passed in imap to the next shared/unshared extent boundary, and
  * if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork.  In this case *shared is set to true, else to false.
+ * COW fork.
  *
  * Note that imap will always contain the block numbers for the existing blocks
  * in the data fork, as the upper layers need them for read-modify-write
@@ -250,14 +246,14 @@ xfs_reflink_trim_around_shared(
 int
 xfs_reflink_reserve_cow(
        struct xfs_inode        *ip,
-       struct xfs_bmbt_irec    *imap,
-       bool                    *shared)
+       struct xfs_bmbt_irec    *imap)
 {
        struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
        struct xfs_bmbt_irec    got;
        int                     error = 0;
-       bool                    eof = false, trimmed;
+       bool                    eof = false;
        struct xfs_iext_cursor  icur;
+       bool                    shared;
 
        /*
         * Search the COW fork extent list first.  This serves two purposes:
@@ -273,18 +269,16 @@ xfs_reflink_reserve_cow(
        if (!eof && got.br_startoff <= imap->br_startoff) {
                trace_xfs_reflink_cow_found(ip, imap);
                xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-
-               *shared = true;
                return 0;
        }
 
        /* Trim the mapping to the nearest shared extent boundary. */
-       error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+       error = xfs_reflink_trim_around_shared(ip, imap, &shared);
        if (error)
                return error;
 
        /* Not shared?  Just report the (potentially capped) extent. */
-       if (!*shared)
+       if (!shared)
                return 0;
 
        /*
@@ -368,7 +362,6 @@ xfs_find_trim_cow_extent(
        xfs_filblks_t           count_fsb = imap->br_blockcount;
        struct xfs_iext_cursor  icur;
        struct xfs_bmbt_irec    got;
-       bool                    trimmed;
 
        *found = false;
 
@@ -376,9 +369,13 @@ xfs_find_trim_cow_extent(
         * If we don't find an overlapping extent, trim the range we need to
         * allocate to fit the hole we found.
         */
-       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) ||
-           got.br_startoff > offset_fsb)
-               return xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+       if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+               got.br_startoff = offset_fsb + count_fsb;
+       if (got.br_startoff > offset_fsb) {
+               xfs_trim_extent(imap, imap->br_startoff,
+                               got.br_startoff - imap->br_startoff);
+               return xfs_reflink_trim_around_shared(ip, imap, shared);
+       }
 
        *shared = true;
        if (isnullstartblock(got.br_startblock)) {
index c585ad9552b23ff375fd40c21a6aadab694b8e14..7f47202b5639142054420b2fb2384e9c44cfbb0d 100644 (file)
@@ -10,10 +10,10 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
                xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
                xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
 extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
-               struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+               struct xfs_bmbt_irec *irec, bool *shared);
 
 extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
-               struct xfs_bmbt_irec *imap, bool *shared);
+               struct xfs_bmbt_irec *imap);
 extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
                struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
 extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
index 4e4423153071c788bb278d3856e3ff346cb915f8..cc509743facd8ddfedc6c8946446f515a79dfb69 100644 (file)
@@ -29,30 +29,30 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
                char    *desc;
                int     endpoint;
        } xstats[] = {
-               { "extent_alloc",       XFSSTAT_END_EXTENT_ALLOC        },
-               { "abt",                XFSSTAT_END_ALLOC_BTREE         },
-               { "blk_map",            XFSSTAT_END_BLOCK_MAPPING       },
-               { "bmbt",               XFSSTAT_END_BLOCK_MAP_BTREE     },
-               { "dir",                XFSSTAT_END_DIRECTORY_OPS       },
-               { "trans",              XFSSTAT_END_TRANSACTIONS        },
-               { "ig",                 XFSSTAT_END_INODE_OPS           },
-               { "log",                XFSSTAT_END_LOG_OPS             },
-               { "push_ail",           XFSSTAT_END_TAIL_PUSHING        },
-               { "xstrat",             XFSSTAT_END_WRITE_CONVERT       },
-               { "rw",                 XFSSTAT_END_READ_WRITE_OPS      },
-               { "attr",               XFSSTAT_END_ATTRIBUTE_OPS       },
-               { "icluster",           XFSSTAT_END_INODE_CLUSTER       },
-               { "vnodes",             XFSSTAT_END_VNODE_OPS           },
-               { "buf",                XFSSTAT_END_BUF                 },
-               { "abtb2",              XFSSTAT_END_ABTB_V2             },
-               { "abtc2",              XFSSTAT_END_ABTC_V2             },
-               { "bmbt2",              XFSSTAT_END_BMBT_V2             },
-               { "ibt2",               XFSSTAT_END_IBT_V2              },
-               { "fibt2",              XFSSTAT_END_FIBT_V2             },
-               { "rmapbt",             XFSSTAT_END_RMAP_V2             },
-               { "refcntbt",           XFSSTAT_END_REFCOUNT            },
+               { "extent_alloc",       xfsstats_offset(xs_abt_lookup)  },
+               { "abt",                xfsstats_offset(xs_blk_mapr)    },
+               { "blk_map",            xfsstats_offset(xs_bmbt_lookup) },
+               { "bmbt",               xfsstats_offset(xs_dir_lookup)  },
+               { "dir",                xfsstats_offset(xs_trans_sync)  },
+               { "trans",              xfsstats_offset(xs_ig_attempts) },
+               { "ig",                 xfsstats_offset(xs_log_writes)  },
+               { "log",                xfsstats_offset(xs_try_logspace)},
+               { "push_ail",           xfsstats_offset(xs_xstrat_quick)},
+               { "xstrat",             xfsstats_offset(xs_write_calls) },
+               { "rw",                 xfsstats_offset(xs_attr_get)    },
+               { "attr",               xfsstats_offset(xs_iflush_count)},
+               { "icluster",           xfsstats_offset(vn_active)      },
+               { "vnodes",             xfsstats_offset(xb_get)         },
+               { "buf",                xfsstats_offset(xs_abtb_2)      },
+               { "abtb2",              xfsstats_offset(xs_abtc_2)      },
+               { "abtc2",              xfsstats_offset(xs_bmbt_2)      },
+               { "bmbt2",              xfsstats_offset(xs_ibt_2)       },
+               { "ibt2",               xfsstats_offset(xs_fibt_2)      },
+               { "fibt2",              xfsstats_offset(xs_rmap_2)      },
+               { "rmapbt",             xfsstats_offset(xs_refcbt_2)    },
+               { "refcntbt",           xfsstats_offset(xs_qm_dqreclaims)},
                /* we print both series of quota information together */
-               { "qm",                 XFSSTAT_END_QM                  },
+               { "qm",                 xfsstats_offset(xs_xstrat_bytes)},
        };
 
        /* Loop over all stats groups */
@@ -104,6 +104,10 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
 #ifdef CONFIG_PROC_FS
 /* legacy quota interfaces */
 #ifdef CONFIG_XFS_QUOTA
+
+#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims)
+#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot)
+
 static int xqm_proc_show(struct seq_file *m, void *v)
 {
        /* maximum; incore; ratio free to inuse; freelist */
@@ -119,7 +123,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
        int j;
 
        seq_printf(m, "qm");
-       for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+       for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
                seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
        seq_putc(m, '\n');
        return 0;
index 130db070e4d8a9ae0ad22b89d85cdd3eb3255f7e..34d704f703d25e07ab8af65f9ed0205a9ab6e655 100644 (file)
@@ -41,17 +41,14 @@ enum {
  * XFS global statistics
  */
 struct __xfsstats {
-# define XFSSTAT_END_EXTENT_ALLOC      4
        uint32_t                xs_allocx;
        uint32_t                xs_allocb;
        uint32_t                xs_freex;
        uint32_t                xs_freeb;
-# define XFSSTAT_END_ALLOC_BTREE       (XFSSTAT_END_EXTENT_ALLOC+4)
        uint32_t                xs_abt_lookup;
        uint32_t                xs_abt_compare;
        uint32_t                xs_abt_insrec;
        uint32_t                xs_abt_delrec;
-# define XFSSTAT_END_BLOCK_MAPPING     (XFSSTAT_END_ALLOC_BTREE+7)
        uint32_t                xs_blk_mapr;
        uint32_t                xs_blk_mapw;
        uint32_t                xs_blk_unmap;
@@ -59,21 +56,17 @@ struct __xfsstats {
        uint32_t                xs_del_exlist;
        uint32_t                xs_look_exlist;
        uint32_t                xs_cmp_exlist;
-# define XFSSTAT_END_BLOCK_MAP_BTREE   (XFSSTAT_END_BLOCK_MAPPING+4)
        uint32_t                xs_bmbt_lookup;
        uint32_t                xs_bmbt_compare;
        uint32_t                xs_bmbt_insrec;
        uint32_t                xs_bmbt_delrec;
-# define XFSSTAT_END_DIRECTORY_OPS     (XFSSTAT_END_BLOCK_MAP_BTREE+4)
        uint32_t                xs_dir_lookup;
        uint32_t                xs_dir_create;
        uint32_t                xs_dir_remove;
        uint32_t                xs_dir_getdents;
-# define XFSSTAT_END_TRANSACTIONS      (XFSSTAT_END_DIRECTORY_OPS+3)
        uint32_t                xs_trans_sync;
        uint32_t                xs_trans_async;
        uint32_t                xs_trans_empty;
-# define XFSSTAT_END_INODE_OPS         (XFSSTAT_END_TRANSACTIONS+7)
        uint32_t                xs_ig_attempts;
        uint32_t                xs_ig_found;
        uint32_t                xs_ig_frecycle;
@@ -81,13 +74,11 @@ struct __xfsstats {
        uint32_t                xs_ig_dup;
        uint32_t                xs_ig_reclaims;
        uint32_t                xs_ig_attrchg;
-# define XFSSTAT_END_LOG_OPS           (XFSSTAT_END_INODE_OPS+5)
        uint32_t                xs_log_writes;
        uint32_t                xs_log_blocks;
        uint32_t                xs_log_noiclogs;
        uint32_t                xs_log_force;
        uint32_t                xs_log_force_sleep;
-# define XFSSTAT_END_TAIL_PUSHING      (XFSSTAT_END_LOG_OPS+10)
        uint32_t                xs_try_logspace;
        uint32_t                xs_sleep_logspace;
        uint32_t                xs_push_ail;
@@ -98,22 +89,17 @@ struct __xfsstats {
        uint32_t                xs_push_ail_flushing;
        uint32_t                xs_push_ail_restarts;
        uint32_t                xs_push_ail_flush;
-# define XFSSTAT_END_WRITE_CONVERT     (XFSSTAT_END_TAIL_PUSHING+2)
        uint32_t                xs_xstrat_quick;
        uint32_t                xs_xstrat_split;
-# define XFSSTAT_END_READ_WRITE_OPS    (XFSSTAT_END_WRITE_CONVERT+2)
        uint32_t                xs_write_calls;
        uint32_t                xs_read_calls;
-# define XFSSTAT_END_ATTRIBUTE_OPS     (XFSSTAT_END_READ_WRITE_OPS+4)
        uint32_t                xs_attr_get;
        uint32_t                xs_attr_set;
        uint32_t                xs_attr_remove;
        uint32_t                xs_attr_list;
-# define XFSSTAT_END_INODE_CLUSTER     (XFSSTAT_END_ATTRIBUTE_OPS+3)
        uint32_t                xs_iflush_count;
        uint32_t                xs_icluster_flushcnt;
        uint32_t                xs_icluster_flushinode;
-# define XFSSTAT_END_VNODE_OPS         (XFSSTAT_END_INODE_CLUSTER+8)
        uint32_t                vn_active;      /* # vnodes not on free lists */
        uint32_t                vn_alloc;       /* # times vn_alloc called */
        uint32_t                vn_get;         /* # times vn_get called */
@@ -122,7 +108,6 @@ struct __xfsstats {
        uint32_t                vn_reclaim;     /* # times vn_reclaim called */
        uint32_t                vn_remove;      /* # times vn_remove called */
        uint32_t                vn_free;        /* # times vn_free called */
-#define XFSSTAT_END_BUF                        (XFSSTAT_END_VNODE_OPS+9)
        uint32_t                xb_get;
        uint32_t                xb_create;
        uint32_t                xb_get_locked;
@@ -133,28 +118,19 @@ struct __xfsstats {
        uint32_t                xb_page_found;
        uint32_t                xb_get_read;
 /* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2            (XFSSTAT_END_BUF + __XBTS_MAX)
        uint32_t                xs_abtb_2[__XBTS_MAX];
-#define XFSSTAT_END_ABTC_V2            (XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
        uint32_t                xs_abtc_2[__XBTS_MAX];
-#define XFSSTAT_END_BMBT_V2            (XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
        uint32_t                xs_bmbt_2[__XBTS_MAX];
-#define XFSSTAT_END_IBT_V2             (XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
        uint32_t                xs_ibt_2[__XBTS_MAX];
-#define XFSSTAT_END_FIBT_V2            (XFSSTAT_END_IBT_V2 + __XBTS_MAX)
        uint32_t                xs_fibt_2[__XBTS_MAX];
-#define XFSSTAT_END_RMAP_V2            (XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
        uint32_t                xs_rmap_2[__XBTS_MAX];
-#define XFSSTAT_END_REFCOUNT           (XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
        uint32_t                xs_refcbt_2[__XBTS_MAX];
-#define XFSSTAT_END_XQMSTAT            (XFSSTAT_END_REFCOUNT + 6)
        uint32_t                xs_qm_dqreclaims;
        uint32_t                xs_qm_dqreclaim_misses;
        uint32_t                xs_qm_dquot_dups;
        uint32_t                xs_qm_dqcachemisses;
        uint32_t                xs_qm_dqcachehits;
        uint32_t                xs_qm_dqwants;
-#define XFSSTAT_END_QM                 (XFSSTAT_END_XQMSTAT+2)
        uint32_t                xs_qm_dquot;
        uint32_t                xs_qm_dquot_unused;
 /* Extra precision counters */
@@ -163,10 +139,12 @@ struct __xfsstats {
        uint64_t                xs_read_bytes;
 };
 
+#define        xfsstats_offset(f)      (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
+
 struct xfsstats {
        union {
                struct __xfsstats       s;
-               uint32_t                a[XFSSTAT_END_XQMSTAT];
+               uint32_t                a[xfsstats_offset(xs_qm_dquot)];
        };
 };
 
index 207ee302b1bb9f4039b8963b7ae1a68ce8ac8487..d3e6cd063688406ae69b0625db862f09cdbcd260 100644 (file)
@@ -43,6 +43,7 @@
 #include <linux/dax.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/mempool.h>
 #include <linux/writeback.h>
@@ -933,6 +934,32 @@ xfs_fs_alloc_inode(
        return NULL;
 }
 
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+       struct xfs_inode        *ip,
+       int                     whichfork)
+{
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
+       struct xfs_bmbt_irec    got;
+       struct xfs_iext_cursor  icur;
+
+       if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+               return;
+       do {
+               if (isnullstartblock(got.br_startblock)) {
+                       xfs_warn(ip->i_mount,
+       "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+                               ip->i_ino,
+                               whichfork == XFS_DATA_FORK ? "data" : "cow",
+                               got.br_startoff, got.br_blockcount);
+               }
+       } while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork)      do { } while (0)
+#endif
+
 /*
  * Now that the generic code is guaranteed not to be accessing
  * the linux inode, we can inactivate and reclaim the inode.
@@ -951,7 +978,12 @@ xfs_fs_destroy_inode(
 
        xfs_inactive(ip);
 
-       ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+       if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) {
+               xfs_check_delalloc(ip, XFS_DATA_FORK);
+               xfs_check_delalloc(ip, XFS_COW_FORK);
+               ASSERT(0);
+       }
+
        XFS_STATS_INC(ip->i_mount, vn_reclaim);
 
        /*
@@ -1097,7 +1129,7 @@ xfs_fs_statfs(
        xfs_extlen_t            lsize;
        int64_t                 ffree;
 
-       statp->f_type = XFS_SB_MAGIC;
+       statp->f_type = XFS_SUPER_MAGIC;
        statp->f_namelen = MAXNAMELEN - 1;
 
        id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
@@ -1650,7 +1682,7 @@ xfs_fs_fill_super(
         * we must configure the block size in the superblock before we run the
         * full mount process as the mount process can lookup and cache inodes.
         */
-       sb->s_magic = XFS_SB_MAGIC;
+       sb->s_magic = XFS_SUPER_MAGIC;
        sb->s_blocksize = mp->m_sb.sb_blocksize;
        sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
        sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
index c3d278e96ad1a73462808c1867ed7615974011e0..a0c5dbda18aabfa84c4f0ca94543b80174e5ba19 100644 (file)
@@ -220,6 +220,7 @@ void                xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
 void           xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
                                  uint);
 void           xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+bool           xfs_trans_buf_is_dirty(struct xfs_buf *bp);
 void           xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
 
 void           xfs_extent_free_init_defer_op(void);
index 55326f971cb36bb87e35552f461a81a1c206ef3f..d3a4e89bf4a0ddb916ed4f5d395285e2e2188869 100644 (file)
@@ -531,17 +531,33 @@ xfsaild(
                        set_current_state(TASK_INTERRUPTIBLE);
 
                /*
-                * Check kthread_should_stop() after we set the task state
-                * to guarantee that we either see the stop bit and exit or
-                * the task state is reset to runnable such that it's not
-                * scheduled out indefinitely and detects the stop bit at
-                * next iteration.
-                *
+                * Check kthread_should_stop() after we set the task state to
+                * guarantee that we either see the stop bit and exit or the
+                * task state is reset to runnable such that it's not scheduled
+                * out indefinitely and detects the stop bit at next iteration.
                 * A memory barrier is included in the above task state set to
                 * serialize against kthread_stop().
                 */
                if (kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
+
+                       /*
+                        * The caller forces out the AIL before stopping the
+                        * thread in the common case, which means the delwri
+                        * queue is drained. In the shutdown case, the queue may
+                        * still hold relogged buffers that haven't been
+                        * submitted because they were pinned since added to the
+                        * queue.
+                        *
+                        * Log I/O error processing stales the underlying buffer
+                        * and clears the delwri state, expecting the buf to be
+                        * removed on the next submission attempt. That won't
+                        * happen if we're shutting down, so this is the last
+                        * opportunity to release such buffers from the queue.
+                        */
+                       ASSERT(list_empty(&ailp->ail_buf_list) ||
+                              XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+                       xfs_buf_delwri_cancel(&ailp->ail_buf_list);
                        break;
                }
 
index 286a287ac57acc5c5abcbe51a881af09026ecf0e..629f1479c9d234492d3a7d431dd21bb30f4db393 100644 (file)
@@ -264,11 +264,39 @@ xfs_trans_read_buf_map(
                        return -EIO;
                }
 
+               /*
+                * Check if the caller is trying to read a buffer that is
+                * already attached to the transaction yet has no buffer ops
+                * assigned.  Ops are usually attached when the buffer is
+                * attached to the transaction, or by the read caller in
+                * special circumstances.  That didn't happen, which is not
+                * how this is supposed to go.
+                *
+                * If the buffer passes verification we'll let this go, but if
+                * not we have to shut down.  Let the transaction cleanup code
+                * release this buffer when it kills the transaction.
+                */
+               ASSERT(bp->b_ops != NULL);
+               error = xfs_buf_ensure_ops(bp, ops);
+               if (error) {
+                       xfs_buf_ioerror_alert(bp, __func__);
+
+                       if (tp->t_flags & XFS_TRANS_DIRTY)
+                               xfs_force_shutdown(tp->t_mountp,
+                                               SHUTDOWN_META_IO_ERROR);
+
+                       /* bad CRC means corrupted metadata */
+                       if (error == -EFSBADCRC)
+                               error = -EFSCORRUPTED;
+                       return error;
+               }
+
                bip = bp->b_log_item;
                bip->bli_recur++;
 
                ASSERT(atomic_read(&bip->bli_refcount) > 0);
                trace_xfs_trans_read_buf_recur(bip);
+               ASSERT(bp->b_ops != NULL || ops == NULL);
                *bpp = bp;
                return 0;
        }
@@ -316,11 +344,25 @@ xfs_trans_read_buf_map(
                _xfs_trans_bjoin(tp, bp, 1);
                trace_xfs_trans_read_buf(bp->b_log_item);
        }
+       ASSERT(bp->b_ops != NULL || ops == NULL);
        *bpp = bp;
        return 0;
 
 }
 
+/* Has this buffer been dirtied by anyone? */
+bool
+xfs_trans_buf_is_dirty(
+       struct xfs_buf          *bp)
+{
+       struct xfs_buf_log_item *bip = bp->b_log_item;
+
+       if (!bip)
+               return false;
+       ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+       return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+}
+
 /*
  * Release a buffer previously joined to the transaction. If the buffer is
  * modified within this transaction, decrement the recursion count but do not
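xfs_trans_buf_is_dirty() lets code outside the transaction layer ask whether a buffer has already been logged; per the series summary it is used by the xrep_findroot_block() buffer-state fix. A hedged usage sketch (hypothetical caller, not the actual repair code):

	/*
	 * Illustrative only: skip a buffer some transaction has already
	 * dirtied, since rewriting it here would clobber logged changes.
	 */
	if (xfs_trans_buf_is_dirty(bp)) {
		/* Another transaction logged this buffer; leave it alone. */
		return 0;
	}
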
index 1a6fee974116a406deb077bd8be5585e5c4591cb..96c24478d8cedb50de46c7b27662f9b81008ce53 100644 (file)
@@ -29,6 +29,7 @@
 #define HPFS_SUPER_MAGIC       0xf995e849
 #define ISOFS_SUPER_MAGIC      0x9660
 #define JFFS2_SUPER_MAGIC      0x72b6
+#define XFS_SUPER_MAGIC                0x58465342      /* "XFSB" */
 #define PSTOREFS_MAGIC         0x6165676C
 #define EFIVARFS_MAGIC         0xde5e81e4
 #define HOSTFS_SUPER_MAGIC     0x00c0ffee
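
With XFS_SUPER_MAGIC exported through the uapi header and reported by xfs_fs_statfs(), userspace can match the value directly. A minimal sketch (illustrative, not part of the patch; requires a kernel with this header change):

	#include <stdio.h>
	#include <sys/vfs.h>
	#include <linux/magic.h>

	/* Illustrative only: detect an XFS mount via statfs(2) and f_type. */
	int main(int argc, char **argv)
	{
		struct statfs	sfs;
		const char	*path = argc > 1 ? argv[1] : ".";

		if (statfs(path, &sfs) != 0) {
			perror("statfs");
			return 1;
		}
		printf("%s: %s\n", path,
		       sfs.f_type == XFS_SUPER_MAGIC ? "XFS" : "not XFS");
		return 0;
	}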