Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 25 Sep 2009 16:27:30 +0000 (09:27 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 25 Sep 2009 16:27:30 +0000 (09:27 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Sep 2009 16:27:30 +0000 (09:27 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Sep 2009 16:27:30 +0000 (09:27 -0700)
diff --git a/fs/buffer.c b/fs/buffer.c

index 24afd7422ae866851ecfbd12d7e1c231c5bffda5..6fa530256bfd7b118bbef4aebce7ceb79e145928 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -280,7 +280,7 @@ void invalidate_bdev(struct block_device *bdev)
  EXPORT_SYMBOL(invalidate_bdev);
  
  /*
- * Kick pdflush then try to free up some ZONE_NORMAL memory.
+ * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
   */
  static void free_more_memory(void)
  {
@@ -1709,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
                 /*
                  * If it's a fully non-blocking write attempt and we cannot
                  * lock the buffer then redirty the page.  Note that this can
-                * potentially cause a busy-wait loop from pdflush and kswapd
-                * activity, but those code paths have their own higher-level
-                * throttling.
+                * potentially cause a busy-wait loop from writeback threads
+                * and kswapd activity, but those code paths have their own
+                * higher-level throttling.
                  */
                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
                         lock_buffer(bh);
@@ -3208,7 +3208,7 @@ EXPORT_SYMBOL(block_sync_page);
   * still running obsolete flush daemons, so we terminate them here.
   *
   * Use of bdflush() is deprecated and will be removed in a future kernel.
- * The `pdflush' kernel threads fully replace bdflush daemons and this call.
+ * The `flush-X' kernel threads fully replace bdflush daemons and this call.
   */
  SYSCALL_DEFINE2(bdflush, int, func, long, data)
  {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c

index 8e1e5e19d21e1fd60c9f99b3988fee0191c3e901..fb61178c86e3e56eb0dcecaad7ff6e3f3db1cc1d 100644 (file)
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -41,8 +41,9 @@ struct wb_writeback_args {
         long nr_pages;
         struct super_block *sb;
         enum writeback_sync_modes sync_mode;
-       int for_kupdate;
-       int range_cyclic;
+       int for_kupdate:1;
+       int range_cyclic:1;
+       int for_background:1;
  };
  
  /*
@@ -257,6 +258,15 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
                 .range_cyclic   = 1,
         };
  
+       /*
+        * We treat @nr_pages=0 as the special case to do background writeback,
+        * ie. to sync pages until the background dirty threshold is reached.
+        */
+       if (!nr_pages) {
+               args.nr_pages = LONG_MAX;
+               args.for_background = 1;
+       }
+
         bdi_alloc_queue_work(bdi, &args);
  }
  
@@ -310,7 +320,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
          * For inodes being constantly redirtied, dirtied_when can get stuck.
          * It _appears_ to be in the future, but is actually in distant past.
          * This test is necessary to prevent such wrapped-around relative times
-        * from permanently stopping the whole pdflush writeback.
+        * from permanently stopping the whole bdi writeback.
          */
         ret = ret && time_before_eq(inode->dirtied_when, jiffies);
  #endif
@@ -324,13 +334,38 @@ static void move_expired_inodes(struct list_head *delaying_queue,
                                struct list_head *dispatch_queue,
                                 unsigned long *older_than_this)
  {
+       LIST_HEAD(tmp);
+       struct list_head *pos, *node;
+       struct super_block *sb = NULL;
+       struct inode *inode;
+       int do_sb_sort = 0;
+
         while (!list_empty(delaying_queue)) {
-               struct inode *inode = list_entry(delaying_queue->prev,
-                                               struct inode, i_list);
+               inode = list_entry(delaying_queue->prev, struct inode, i_list);
                 if (older_than_this &&
                     inode_dirtied_after(inode, *older_than_this))
                         break;
-               list_move(&inode->i_list, dispatch_queue);
+               if (sb && sb != inode->i_sb)
+                       do_sb_sort = 1;
+               sb = inode->i_sb;
+               list_move(&inode->i_list, &tmp);
+       }
+
+       /* just one sb in list, splice to dispatch_queue and we're done */
+       if (!do_sb_sort) {
+               list_splice(&tmp, dispatch_queue);
+               return;
+       }
+
+       /* Move inodes from one superblock together */
+       while (!list_empty(&tmp)) {
+               inode = list_entry(tmp.prev, struct inode, i_list);
+               sb = inode->i_sb;
+               list_for_each_prev_safe(pos, node, &tmp) {
+                       inode = list_entry(pos, struct inode, i_list);
+                       if (inode->i_sb == sb)
+                               list_move(&inode->i_list, dispatch_queue);
+               }
         }
  }
  
@@ -439,8 +474,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
         spin_lock(&inode_lock);
         inode->i_state &= ~I_SYNC;
         if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-               if (!(inode->i_state & I_DIRTY) &&
-                   mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+               if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
+                       /*
+                        * More pages get dirtied by a fast dirtier.
+                        */
+                       goto select_queue;
+               } else if (inode->i_state & I_DIRTY) {
+                       /*
+                        * At least XFS will redirty the inode during the
+                        * writeback (delalloc) and on io completion (isize).
+                        */
+                       redirty_tail(inode);
+               } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                         /*
                          * We didn't write back all the pages.  nfs_writepages()
                          * sometimes bales out without doing anything. Redirty
@@ -462,6 +507,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                                  * soon as the queue becomes uncongested.
                                  */
                                 inode->i_state |= I_DIRTY_PAGES;
+select_queue:
                                 if (wbc->nr_to_write <= 0) {
                                         /*
                                          * slice used up: queue for next turn
@@ -484,12 +530,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
                                 inode->i_state |= I_DIRTY_PAGES;
                                 redirty_tail(inode);
                         }
-               } else if (inode->i_state & I_DIRTY) {
-                       /*
-                        * Someone redirtied the inode while were writing back
-                        * the pages.
-                        */
-                       redirty_tail(inode);
                 } else if (atomic_read(&inode->i_count)) {
                         /*
                          * The inode is clean, inuse
@@ -506,6 +546,17 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
         return ret;
  }
  
+static void unpin_sb_for_writeback(struct super_block **psb)
+{
+       struct super_block *sb = *psb;
+
+       if (sb) {
+               up_read(&sb->s_umount);
+               put_super(sb);
+               *psb = NULL;
+       }
+}
+
  /*
   * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
   * before calling writeback. So make sure that we do pin it, so it doesn't
@@ -515,10 +566,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
   * 1 if we failed.
   */
  static int pin_sb_for_writeback(struct writeback_control *wbc,
-                                  struct inode *inode)
+                               struct inode *inode, struct super_block **psb)
  {
         struct super_block *sb = inode->i_sb;
  
+       /*
+        * If this sb is already pinned, nothing more to do. If not and
+        * *psb is non-NULL, unpin the old one first
+        */
+       if (sb == *psb)
+               return 0;
+       else if (*psb)
+               unpin_sb_for_writeback(psb);
+
         /*
          * Caller must already hold the ref for this
          */
@@ -532,7 +592,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
         if (down_read_trylock(&sb->s_umount)) {
                 if (sb->s_root) {
                         spin_unlock(&sb_lock);
-                       return 0;
+                       goto pinned;
                 }
                 /*
                  * umounted, drop rwsem again and fall through to failure
@@ -543,24 +603,15 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
         sb->s_count--;
         spin_unlock(&sb_lock);
         return 1;
-}
-
-static void unpin_sb_for_writeback(struct writeback_control *wbc,
-                                  struct inode *inode)
-{
-       struct super_block *sb = inode->i_sb;
-
-       if (wbc->sync_mode == WB_SYNC_ALL)
-               return;
-
-       up_read(&sb->s_umount);
-       put_super(sb);
+pinned:
+       *psb = sb;
+       return 0;
  }
  
  static void writeback_inodes_wb(struct bdi_writeback *wb,
                                 struct writeback_control *wbc)
  {
-       struct super_block *sb = wbc->sb;
+       struct super_block *sb = wbc->sb, *pin_sb = NULL;
         const int is_blkdev_sb = sb_is_blkdev_sb(sb);
         const unsigned long start = jiffies;    /* livelock avoidance */
  
@@ -619,7 +670,7 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                 if (inode_dirtied_after(inode, start))
                         break;
  
-               if (pin_sb_for_writeback(wbc, inode)) {
+               if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
                         requeue_io(inode);
                         continue;
                 }
@@ -628,7 +679,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                 __iget(inode);
                 pages_skipped = wbc->pages_skipped;
                 writeback_single_inode(inode, wbc);
-               unpin_sb_for_writeback(wbc, inode);
                 if (wbc->pages_skipped != pages_skipped) {
                         /*
                          * writeback is not making progress due to locked
@@ -648,6 +698,8 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
                         wbc->more_io = 1;
         }
  
+       unpin_sb_for_writeback(&pin_sb);
+
         spin_unlock(&inode_lock);
         /* Leave any unwritten inodes on b_io */
  }
@@ -706,6 +758,7 @@ static long wb_writeback(struct bdi_writeback *wb,
         };
         unsigned long oldest_jif;
         long wrote = 0;
+       struct inode *inode;
  
         if (wbc.for_kupdate) {
                 wbc.older_than_this = &oldest_jif;
@@ -719,20 +772,16 @@ static long wb_writeback(struct bdi_writeback *wb,
  
         for (;;) {
                 /*
-                * Don't flush anything for non-integrity writeback where
-                * no nr_pages was given
+                * Stop writeback when nr_pages has been consumed
                  */
-               if (!args->for_kupdate && args->nr_pages <= 0 &&
-                    args->sync_mode == WB_SYNC_NONE)
+               if (args->nr_pages <= 0)
                         break;
  
                 /*
-                * If no specific pages were given and this is just a
-                * periodic background writeout and we are below the
-                * background dirty threshold, don't do anything
+                * For background writeout, stop when we are below the
+                * background dirty threshold
                  */
-               if (args->for_kupdate && args->nr_pages <= 0 &&
-                   !over_bground_thresh())
+               if (args->for_background && !over_bground_thresh())
                         break;
  
                 wbc.more_io = 0;
@@ -744,13 +793,32 @@ static long wb_writeback(struct bdi_writeback *wb,
                 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
  
                 /*
-                * If we ran out of stuff to write, bail unless more_io got set
+                * If we consumed everything, see if we have more
                  */
-               if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-                       if (wbc.more_io && !wbc.for_kupdate)
-                               continue;
+               if (wbc.nr_to_write <= 0)
+                       continue;
+               /*
+                * Didn't write everything and we don't have more IO, bail
+                */
+               if (!wbc.more_io)
                         break;
+               /*
+                * Did we write something? Try for more
+                */
+               if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+                       continue;
+               /*
+                * Nothing written. Wait for some inode to
+                * become available for writeback. Otherwise
+                * we'll just busyloop.
+                */
+               spin_lock(&inode_lock);
+               if (!list_empty(&wb->b_more_io))  {
+                       inode = list_entry(wb->b_more_io.prev,
+                                               struct inode, i_list);
+                       inode_wait_for_writeback(inode);
                 }
+               spin_unlock(&inode_lock);
         }
  
         return wrote;
@@ -1060,9 +1128,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
   * If older_than_this is non-NULL, then only write out inodes which
   * had their first dirtying at a time earlier than *older_than_this.
   *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
   * If `bdi' is non-zero then we're being asked to writeback a specific queue.
   * This function assumes that the blockdev superblock's inodes are backed by
   * a variety of queues, so all inodes are searched.  For other superblocks,
@@ -1141,7 +1206,7 @@ void writeback_inodes_sb(struct super_block *sb)
         nr_to_write = nr_dirty + nr_unstable +
                         (inodes_stat.nr_inodes - inodes_stat.nr_unused);
  
-       bdi_writeback_all(sb, nr_to_write);
+       bdi_start_writeback(sb->s_bdi, nr_to_write);
  }
  EXPORT_SYMBOL(writeback_inodes_sb);
  
diff --git a/mm/page-writeback.c b/mm/page-writeback.c

index d99664e8607e761235a13b2662353df131ac5b41..69b5fbabc8bd4e9248f54f460d0f01b4e75f8e86 100644 (file)
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -44,18 +44,21 @@ static long ratelimit_pages = 32;
  /*
   * When balance_dirty_pages decides that the caller needs to perform some
   * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * It should be somewhat larger than dirtied pages to ensure that reasonably
   * large amounts of I/O are submitted.
   */
-static inline long sync_writeback_pages(void)
+static inline long sync_writeback_pages(unsigned long dirtied)
  {
-       return ratelimit_pages + ratelimit_pages / 2;
+       if (dirtied < ratelimit_pages)
+               dirtied = ratelimit_pages;
+
+       return dirtied + dirtied / 2;
  }
  
  /* The following parameters are exported via /proc/sys/vm */
  
  /*
- * Start background writeback (via pdflush) at this percentage
+ * Start background writeback (via writeback threads) at this percentage
   */
  int dirty_background_ratio = 10;
  
@@ -474,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
   * balance_dirty_pages() must be called by processes which are generating dirty
   * data.  It looks at the number of dirty pages in the machine and will force
   * the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
   */
-static void balance_dirty_pages(struct address_space *mapping)
+static void balance_dirty_pages(struct address_space *mapping,
+                               unsigned long write_chunk)
  {
         long nr_reclaimable, bdi_nr_reclaimable;
         long nr_writeback, bdi_nr_writeback;
@@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping)
         unsigned long dirty_thresh;
         unsigned long bdi_thresh;
         unsigned long pages_written = 0;
-       unsigned long write_chunk = sync_writeback_pages();
         unsigned long pause = 1;
  
         struct backing_dev_info *bdi = mapping->backing_dev_info;
@@ -579,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping)
                 bdi->dirty_exceeded = 0;
  
         if (writeback_in_progress(bdi))
-               return;         /* pdflush is already working this queue */
+               return;
  
         /*
          * In laptop mode, we wait until hitting the higher threshold before
@@ -590,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping)
          * background_thresh, to keep the amount of dirty memory low.
          */
         if ((laptop_mode && pages_written) ||
-           (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
-                                         + global_page_state(NR_UNSTABLE_NFS))
+           (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
+                              + global_page_state(NR_UNSTABLE_NFS))
                                           > background_thresh)))
-               bdi_start_writeback(bdi, nr_writeback);
+               bdi_start_writeback(bdi, 0);
  }
  
  void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -640,9 +643,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
         p =  &__get_cpu_var(bdp_ratelimits);
         *p += nr_pages_dirtied;
         if (unlikely(*p >= ratelimit)) {
+               ratelimit = sync_writeback_pages(*p);
                 *p = 0;
                 preempt_enable();
-               balance_dirty_pages(mapping);
+               balance_dirty_pages(mapping, ratelimit);
                 return;
         }
         preempt_enable();
diff --git a/mm/shmem.c b/mm/shmem.c

index 98631c26c20001931a6e4ca13032716992d6808c..ccf446a9faa141e28e30fd66ca035503f72c6763 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1046,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
          * sync from ever calling shmem_writepage; but a stacking filesystem
          * may use the ->writepage of its underlying filesystem, in which case
          * tmpfs should write out to swap only in response to memory pressure,
-        * and not for pdflush or sync.  However, in those cases, we do still
-        * want to check if there's a redundant swappage to be discarded.
+        * and not for the writeback threads or sync.  However, in those cases,
+        * we do still want to check if there's a redundant swappage to be
+        * discarded.
          */
         if (wbc->for_reclaim)
                 swap = get_swap_page();
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 1219ceb8a9b2d992da20bb9a10942e7cef2d98b1..64e438898832371277e8baeb89fd5ebb1f72a14a 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1709,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
   *
   * If the caller is !__GFP_FS then the probability of a failure is reasonably
   * high - the zone may be full of dirty or under-writeback pages, which this
- * caller can't do much about.  We kick pdflush and take explicit naps in the
- * hope that some of these pages can be written.  But if the allocating task
- * holds filesystem locks which prevent writeout this might not work, and the
- * allocation attempt will fail.
+ * caller can't do much about.  We kick the writeback threads and take explicit
+ * naps in the hope that some of these pages can be written.  But if the
+ * allocating task holds filesystem locks which prevent writeout this might not
+ * work, and the allocation attempt will fail.
   *
   * returns:    0, if no pages reclaimed
   *             else, the number of pages reclaimed
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 25 Sep 2009 16:27:30 +0000 (09:27 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 25 Sep 2009 16:27:30 +0000 (09:27 -0700)
fs/buffer.c		patch | blob | blame | history
fs/fs-writeback.c		patch | blob | blame | history
mm/page-writeback.c		patch | blob | blame | history
mm/shmem.c		patch | blob | blame | history
mm/vmscan.c		patch | blob | blame | history