Merge tag 'xfs-5.18-merge-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 2 Apr 2022 02:30:44 +0000 (19:30 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 2 Apr 2022 02:30:44 +0000 (19:30 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 2 Apr 2022 02:30:44 +0000 (19:30 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 2 Apr 2022 02:30:44 +0000 (19:30 -0700)
diff --combined fs/xfs/xfs_bio_io.c

index 32fa02945f7399d30b367dac248bf4fb344582c6,17f36db2f792891604611d3b40d18cb281460edc..ae4345b37621b600b8156cb5c10f7b2bac6a79da
--- 1/fs/xfs/xfs_bio_io.c
--- 2/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@@ -9,39 -9,6 +9,6 @@@ static inline unsigned int bio_max_vecs
         return bio_max_segs(howmany(count, PAGE_SIZE));
   }
   
- static void
- xfs_flush_bdev_async_endio(
-       struct bio      *bio)
- {
-       complete(bio->bi_private);
- }
- 
- /*
-  * Submit a request for an async cache flush to run. If the request queue does
-  * not require flush operations, just skip it altogether. If the caller needs
-  * to wait for the flush completion at a later point in time, they must supply a
-  * valid completion. This will be signalled when the flush completes.  The
-  * caller never sees the bio that is issued here.
-  */
- void
- xfs_flush_bdev_async(
-       struct bio              *bio,
-       struct block_device     *bdev,
-       struct completion       *done)
- {
-       struct request_queue    *q = bdev->bd_disk->queue;
- 
-       if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
-               complete(done);
-               return;
-       }
- 
-       bio_init(bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
-       bio->bi_private = done;
-       bio->bi_end_io = xfs_flush_bdev_async_endio;
- 
-       submit_bio(bio);
- }
   int
   xfs_rw_bdev(
         struct block_device     *bdev,
@@@ -59,9 -26,10 +26,9 @@@
         if (is_vmalloc && op == REQ_OP_WRITE)
                 flush_kernel_vmap_range(data, count);
   
- -      bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- -      bio_set_dev(bio, bdev);
+ +      bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
+ +                      GFP_KERNEL);
         bio->bi_iter.bi_sector = sector;
- -      bio->bi_opf = op | REQ_META | REQ_SYNC;
   
         do {
                 struct page     *page = kmem_to_page(data);
@@@ -71,9 -39,10 +38,9 @@@
                 while (bio_add_page(bio, page, len, off) != len) {
                         struct bio      *prev = bio;
   
- -                      bio = bio_alloc(GFP_KERNEL, bio_max_vecs(left));
- -                      bio_copy_dev(bio, prev);
+ +                      bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ +                                      prev->bi_opf, GFP_KERNEL);
                         bio->bi_iter.bi_sector = bio_end_sector(prev);
- -                      bio->bi_opf = prev->bi_opf;
                         bio_chain(prev, bio);
   
                         submit_bio(prev);
diff --combined fs/xfs/xfs_icache.c

index 20186c584c7d6824c26500248ab554fc98257007,6c7267451b82ad4e6aabd784587eb1bf31290d79..bffd6eb0b298eedb54f820014302dd1bd2bca765
--- 1/fs/xfs/xfs_icache.c
--- 2/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@@ -78,7 -78,7 +78,7 @@@ xfs_inode_alloc
          * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
          * and return NULL here on ENOMEM.
          */
- -      ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
+ +      ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);
   
         if (inode_init_always(mp->m_super, VFS_I(ip))) {
                 kmem_cache_free(xfs_inode_cache, ip);
@@@ -883,7 -883,7 +883,7 @@@ xfs_reclaim_inode
          */
         if (xlog_is_shutdown(ip->i_mount->m_log)) {
                 xfs_iunpin_wait(ip);
-               xfs_iflush_abort(ip);
+               xfs_iflush_shutdown_abort(ip);
                 goto reclaim;
         }
         if (xfs_ipincount(ip))
diff --combined fs/xfs/xfs_log.c

index a8034c0afdf2270d0fe7a580cbe2f5375031b1cb,af22cbae9124bd733f46adbd013379423d82357c..499e15b24215703aa3847972b6c44a87a28e70b6
--- 1/fs/xfs/xfs_log.c
--- 2/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@@ -487,7 -487,10 +487,10 @@@ out_error
    * Run all the pending iclog callbacks and wake log force waiters and iclog
    * space waiters so they can process the newly set shutdown state. We really
    * don't care what order we process callbacks here because the log is shut down
-  * and so state cannot change on disk anymore.
+  * and so state cannot change on disk anymore. However, we cannot wake waiters
+  * until the callbacks have been processed because we may be in unmount and
+  * we must ensure that all AIL operations the callbacks perform have completed
+  * before we tear down the AIL.
    *
    * We avoid processing actively referenced iclogs so that we don't run callbacks
    * while the iclog owner might still be preparing the iclog for IO submssion.
@@@ -501,7 -504,6 +504,6 @@@ xlog_state_shutdown_callbacks
         struct xlog_in_core     *iclog;
         LIST_HEAD(cb_list);
   
-       spin_lock(&log->l_icloglock);
         iclog = log->l_iclog;
         do {
                 if (atomic_read(&iclog->ic_refcnt)) {
@@@ -509,26 -511,22 +511,22 @@@
                         continue;
                 }
                 list_splice_init(&iclog->ic_callbacks, &cb_list);
+               spin_unlock(&log->l_icloglock);
+ 
+               xlog_cil_process_committed(&cb_list);
+ 
+               spin_lock(&log->l_icloglock);
                 wake_up_all(&iclog->ic_write_wait);
                 wake_up_all(&iclog->ic_force_wait);
         } while ((iclog = iclog->ic_next) != log->l_iclog);
   
         wake_up_all(&log->l_flush_wait);
-       spin_unlock(&log->l_icloglock);
- 
-       xlog_cil_process_committed(&cb_list);
   }
   
   /*
    * Flush iclog to disk if this is the last reference to the given iclog and the
    * it is in the WANT_SYNC state.
    *
-  * If the caller passes in a non-zero @old_tail_lsn and the current log tail
-  * does not match, there may be metadata on disk that must be persisted before
-  * this iclog is written.  To satisfy that requirement, set the
-  * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
-  * log tail value.
-  *
    * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
    * log tail is updated correctly. NEED_FUA indicates that the iclog will be
    * written to stable storage, and implies that a commit record is contained
@@@ -545,12 -543,10 +543,10 @@@
    * always capture the tail lsn on the iclog on the first NEED_FUA release
    * regardless of the number of active reference counts on this iclog.
    */
- 
   int
   xlog_state_release_iclog(
         struct xlog             *log,
-       struct xlog_in_core     *iclog,
-       xfs_lsn_t               old_tail_lsn)
+       struct xlog_in_core     *iclog)
   {
         xfs_lsn_t               tail_lsn;
         bool                    last_ref;
@@@ -561,18 -557,14 +557,14 @@@
         /*
          * Grabbing the current log tail needs to be atomic w.r.t. the writing
          * of the tail LSN into the iclog so we guarantee that the log tail does
-        * not move between deciding if a cache flush is required and writing
-        * the LSN into the iclog below.
+        * not move between the first time we know that the iclog needs to be
+        * made stable and when we eventually submit it.
          */
-       if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
+       if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+            (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
+           !iclog->ic_header.h_tail_lsn) {
                 tail_lsn = xlog_assign_tail_lsn(log->l_mp);
- 
-               if (old_tail_lsn && tail_lsn != old_tail_lsn)
-                       iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
- 
-               if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
-                   !iclog->ic_header.h_tail_lsn)
-                       iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+               iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
         }
   
         last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
@@@ -583,11 -575,8 +575,8 @@@
                  * pending iclog callbacks that were waiting on the release of
                  * this iclog.
                  */
-               if (last_ref) {
-                       spin_unlock(&log->l_icloglock);
+               if (last_ref)
                         xlog_state_shutdown_callbacks(log);
-                       spin_lock(&log->l_icloglock);
-               }
                 return -EIO;
         }
   
@@@ -600,8 -589,6 +589,6 @@@
         }
   
         iclog->ic_state = XLOG_STATE_SYNCING;
-       if (!iclog->ic_header.h_tail_lsn)
-               iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
         xlog_verify_tail_lsn(log, iclog);
         trace_xlog_iclog_syncing(iclog, _RET_IP_);
   
@@@ -873,7 -860,7 +860,7 @@@ xlog_force_iclog
         iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
         if (iclog->ic_state == XLOG_STATE_ACTIVE)
                 xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
-       return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
+       return xlog_state_release_iclog(iclog->ic_log, iclog);
   }
   
   /*
@@@ -1373,7 -1360,7 +1360,7 @@@ xlog_ioend_work
          */
         if (XFS_TEST_ERROR(error, log->l_mp, XFS_ERRTAG_IODONE_IOERR)) {
                 xfs_alert(log->l_mp, "log I/O error %d", error);
-               xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+               xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
         }
   
         xlog_state_done_syncing(iclog);
@@@ -1882,19 -1869,19 +1869,19 @@@ xlog_write_iclog
                 return;
         }
   
- -      bio_init(&iclog->ic_bio, iclog->ic_bvec, howmany(count, PAGE_SIZE));
- -      bio_set_dev(&iclog->ic_bio, log->l_targ->bt_bdev);
- -      iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
- -      iclog->ic_bio.bi_end_io = xlog_bio_end_io;
- -      iclog->ic_bio.bi_private = iclog;
- -
         /*
          * We use REQ_SYNC | REQ_IDLE here to tell the block layer the are more
          * IOs coming immediately after this one. This prevents the block layer
          * writeback throttle from throttling log writes behind background
          * metadata writeback and causing priority inversions.
          */
- -      iclog->ic_bio.bi_opf = REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE;
+ +      bio_init(&iclog->ic_bio, log->l_targ->bt_bdev, iclog->ic_bvec,
+ +               howmany(count, PAGE_SIZE),
+ +               REQ_OP_WRITE | REQ_META | REQ_SYNC | REQ_IDLE);
+ +      iclog->ic_bio.bi_iter.bi_sector = log->l_logBBstart + bno;
+ +      iclog->ic_bio.bi_end_io = xlog_bio_end_io;
+ +      iclog->ic_bio.bi_private = iclog;
+ +
         if (iclog->ic_flags & XLOG_ICL_NEED_FLUSH) {
                 iclog->ic_bio.bi_opf |= REQ_PREFLUSH;
                 /*
@@@ -1912,7 -1899,7 +1899,7 @@@
         iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
   
         if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) {
-               xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+               xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
                 return;
         }
         if (is_vmalloc_addr(iclog->ic_data))
@@@ -2411,7 -2398,7 +2398,7 @@@ xlog_write_copy_finish
                 ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
                         xlog_is_shutdown(log));
   release_iclog:
-       error = xlog_state_release_iclog(log, iclog, 0);
+       error = xlog_state_release_iclog(log, iclog);
         spin_unlock(&log->l_icloglock);
         return error;
   }
@@@ -2487,7 -2474,7 +2474,7 @@@ xlog_write
                 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
                      "ctx ticket reservation ran out. Need to up reservation");
                 xlog_print_tic_res(log->l_mp, ticket);
-               xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
+               xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
         }
   
         len = xlog_write_calc_vec_length(ticket, log_vector, optype);
@@@ -2628,7 -2615,7 +2615,7 @@@ next_lv
   
         spin_lock(&log->l_icloglock);
         xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
-       error = xlog_state_release_iclog(log, iclog, 0);
+       error = xlog_state_release_iclog(log, iclog);
         spin_unlock(&log->l_icloglock);
   
         return error;
@@@ -3052,7 -3039,7 +3039,7 @@@ restart
                  * reference to the iclog.
                  */
                 if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
-                       error = xlog_state_release_iclog(log, iclog, 0);
+                       error = xlog_state_release_iclog(log, iclog);
                 spin_unlock(&log->l_icloglock);
                 if (error)
                         return error;
@@@ -3821,9 -3808,10 +3808,10 @@@ xlog_verify_iclog
   #endif
   
   /*
-  * Perform a forced shutdown on the log. This should be called once and once
-  * only by the high level filesystem shutdown code to shut the log subsystem
-  * down cleanly.
+  * Perform a forced shutdown on the log.
+  *
+  * This can be called from low level log code to trigger a shutdown, or from the
+  * high level mount shutdown code when the mount shuts down.
    *
    * Our main objectives here are to make sure that:
    *    a. if the shutdown was not due to a log IO error, flush the logs to
@@@ -3832,6 -3820,8 +3820,8 @@@
    *       parties to find out. Nothing new gets queued after this is done.
    *    c. Tasks sleeping on log reservations, pinned objects and
    *       other resources get woken up.
+  *    d. The mount is also marked as shut down so that log triggered shutdowns
+  *       still behave the same as if they called xfs_forced_shutdown().
    *
    * Return true if the shutdown cause was a log IO error and we actually shut the
    * log down.
@@@ -3843,25 -3833,25 +3833,25 @@@ xlog_force_shutdown
   {
         bool            log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR);
   
-       /*
-        * If this happens during log recovery then we aren't using the runtime
-        * log mechanisms yet so there's nothing to shut down.
-        */
-       if (!log || xlog_in_recovery(log))
+       if (!log)
                 return false;
   
-       ASSERT(!xlog_is_shutdown(log));
- 
         /*
          * Flush all the completed transactions to disk before marking the log
          * being shut down. We need to do this first as shutting down the log
          * before the force will prevent the log force from flushing the iclogs
          * to disk.
          *
-        * Re-entry due to a log IO error shutdown during the log force is
-        * prevented by the atomicity of higher level shutdown code.
+        * When we are in recovery, there are no transactions to flush, and
+        * we don't want to touch the log because we don't want to perturb the
+        * current head/tail for future recovery attempts. Hence we need to
+        * avoid a log force in this case.
+        *
+        * If we are shutting down due to a log IO error, then we must avoid
+        * trying to write the log as that may just result in more IO errors and
+        * an endless shutdown/force loop.
          */
-       if (!log_error)
+       if (!log_error && !xlog_in_recovery(log))
                 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
   
         /*
@@@ -3878,11 -3868,24 +3868,24 @@@
         spin_lock(&log->l_icloglock);
         if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) {
                 spin_unlock(&log->l_icloglock);
-               ASSERT(0);
                 return false;
         }
         spin_unlock(&log->l_icloglock);
   
+       /*
+        * If this log shutdown also sets the mount shutdown state, issue a
+        * shutdown warning message.
+        */
+       if (!test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &log->l_mp->m_opstate)) {
+               xfs_alert_tag(log->l_mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Filesystem has been shut down due to log error (0x%x).",
+                               shutdown_flags);
+               xfs_alert(log->l_mp,
+ "Please unmount the filesystem and rectify the problem(s).");
+               if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+                       xfs_stack_trace();
+       }
+ 
         /*
          * We don't want anybody waiting for log reservations after this. That
          * means we have to wake up everybody queued up on reserveq as well as
@@@ -3903,8 -3906,12 +3906,12 @@@
         wake_up_all(&log->l_cilp->xc_start_wait);
         wake_up_all(&log->l_cilp->xc_commit_wait);
         spin_unlock(&log->l_cilp->xc_push_lock);
+ 
+       spin_lock(&log->l_icloglock);
         xlog_state_shutdown_callbacks(log);
+       spin_unlock(&log->l_icloglock);
   
+       wake_up_var(&log->l_opstate);
         return log_error;
   }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 2 Apr 2022 02:30:44 +0000 (19:30 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 2 Apr 2022 02:30:44 +0000 (19:30 -0700)
		1	2
fs/xfs/xfs_bio_io.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_icache.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/xfs/xfs_log.c	patch \|	diff1 \|	diff2 \|	blob \| history