char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o \
- seq_file.o xattr.o libfs.o fs-writeback.o \
+ seq_file.o xattr.o libfs.o fs-writeback.o flushtree.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
stack.o fs_struct.o
--- /dev/null
+#include <linux/fs.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+#include <linux/rbtree.h>
+
+#include "flushtree.h"
+
+#define rb_to_inode(node) rb_entry((node), struct inode, i_flush_node)
+
+/*
+ * When inodes are parked for writeback they are parked in the
+ * flush_tree. The flush tree is a data structure based on an rb tree.
+ *
+ * Duplicate keys are handled by making a list in the tree for each key
+ * value. The order of how we choose the next inode to flush is decided
+ * by two fields. First the earliest dirtied_when value. If there are
+ * duplicate dirtied_when values then the earliest i_flushed_when value
+ * determines who gets flushed next.
+ *
+ * The flush tree organizes the dirtied_when keys with the rb_tree. Any
+ * inodes with a duplicate dirtied_when value are linked together in a
+ * list. This linked list is sorted by the inode's i_flushed_when. When both
+ * the dirtied_when and the i_flushed_when are identical, the order in the
+ * linked list determines the order we flush the inodes.
+ */
+
+/*
+ * Look up the tree-resident inode whose dirtied_when equals @ts.
+ *
+ * Each key appears at most once in the rb tree itself; further inodes
+ * sharing that key hang off the returned inode through i_list.
+ * Returns NULL when no node carries the key.
+ */
+static struct inode *flush_tree_search(struct bdi_writeback *wb,
+				       unsigned long ts)
+{
+	struct rb_node *cur = wb->flush_tree.rb_node;
+
+	while (cur != NULL) {
+		struct inode *candidate = rb_to_inode(cur);
+		unsigned long key = candidate->dirtied_when;
+
+		/* Neither earlier nor later: exact match. */
+		if (!time_before(ts, key) && !time_after(ts, key))
+			return candidate;
+
+		/* Earlier keys live to the left, later keys to the right. */
+		cur = time_before(ts, key) ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Park an inode in the flush tree of its backing device, keyed by
+ * dirtied_when.
+ *
+ * When the key is already present in the tree, the inode is not linked
+ * into the rb tree; it is appended to the tail of the list anchored at
+ * the inode that owns the key.
+ * All inserted inodes must have one of the I_DIRTY flags set, must not be
+ * in the process of being freed, and must not already be in the tree.
+ */
+void flush_tree_insert(struct inode *inode)
+{
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+	struct rb_root *root = &wb->flush_tree;
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON(!(inode->i_state & I_DIRTY));
+	BUG_ON(inode->i_state & (I_FREEING|I_CLEAR));
+	BUG_ON(!RB_EMPTY_NODE(&inode->i_flush_node));
+
+	/* Drop the inode from whatever list it is currently sitting on. */
+	list_del_init(&inode->i_list);
+
+	while (*link != NULL) {
+		struct inode *cur = rb_to_inode(*link);
+
+		parent = *link;
+
+		if (time_before(inode->dirtied_when, cur->dirtied_when)) {
+			link = &parent->rb_left;
+			continue;
+		}
+		if (time_after(inode->dirtied_when, cur->dirtied_when)) {
+			link = &parent->rb_right;
+			continue;
+		}
+
+		/* Duplicate key: chain behind the node that owns it. */
+		list_add_tail(&inode->i_list, &cur->i_list);
+		return;
+	}
+
+	/* New key: link at the leaf position found above and rebalance. */
+	rb_link_node(&inode->i_flush_node, parent, link);
+	rb_insert_color(&inode->i_flush_node, root);
+}
+
+/*
+ * Here we return the inode that has the smallest key in the flush tree
+ * that is greater than the parameter "prev_time".
+ *
+ * The tree is walked once from the root towards a leaf; the best
+ * candidate seen on that path is returned, or NULL if none qualified.
+ *
+ * NOTE(review): the candidate test below uses plain ">" on dirtied_when,
+ * while the descent direction (and the tree ordering established by
+ * flush_tree_insert()) uses the wrap-safe time_before()/time_after()
+ * helpers. The two orderings disagree once the key range straddles a
+ * jiffies wrap - confirm whether that case matters for callers.
+ */
+static struct inode *flush_tree_min_greater(struct bdi_writeback *wb,
+ unsigned long prev_time)
+{
+ struct rb_node *node = wb->flush_tree.rb_node;
+ struct inode *best = NULL;
+
+ while (node) {
+ struct inode *data = rb_to_inode(node);
+
+ /*
+ * Just trying to get lucky: a key of exactly prev_time + 1 is
+ * returned immediately.
+ */
+ if ((prev_time + 1) == data->dirtied_when)
+ return data;
+
+ /*
+ * If this value is greater than our prev_time and is less than
+ * the best so far, this is our new best so far.
+ */
+ if ((data->dirtied_when > prev_time) &&
+ (!best || best->dirtied_when > data->dirtied_when))
+ best = data;
+
+ /*
+ * Search all the way down to the bottom of the tree: go left when
+ * prev_time is before this key, otherwise (equal or after) go
+ * right.
+ */
+ if (time_before(prev_time, data->dirtied_when))
+ node = node->rb_left;
+ else if (time_after_eq(prev_time, data->dirtied_when))
+ node = node->rb_right;
+ }
+
+ return best;
+}
+
+/*
+ * Here is where we iterate to find the next inode to process.
+ *
+ * @wb:         writeback context whose flush tree is scanned
+ * @start_time: timestamp of the current pass; an inode whose
+ *              i_flushed_when is not before this value is treated as
+ *              already handled in this pass
+ * @prev_time:  dirtied_when key of the inode returned by the previous call
+ *
+ * The strategy is to first look for any other inode with the same
+ * dirtied_when value that has not been handled in this pass. That means
+ * checking the tree-resident inode for the key AND every inode chained
+ * behind it on i_list: looking at the tree node alone would skip the
+ * whole chain whenever the head stays in the tree after being visited.
+ * Only when the key is exhausted do we move on to the next highest
+ * dirtied_when value in the tree.
+ *
+ * Returns the next inode to process, or NULL when nothing is left.
+ */
+struct inode *flush_tree_next(struct bdi_writeback *wb,
+			      unsigned long start_time,
+			      unsigned long prev_time)
+{
+	struct inode *head = flush_tree_search(wb, prev_time);
+
+	if (head) {
+		struct inode *pos;
+
+		/* The tree-resident inode for this key is still pending */
+		if (time_before(head->i_flushed_when, start_time))
+			return head;
+
+		/*
+		 * The chain is a ring threaded through i_list with no
+		 * separate list head, so iterating from head->i_list visits
+		 * every other inode sharing this key exactly once.
+		 */
+		list_for_each_entry(pos, &head->i_list, i_list) {
+			if (time_before(pos->i_flushed_when, start_time))
+				return pos;
+		}
+	}
+
+	/* Now we have to find the oldest one next */
+	return flush_tree_min_greater(wb, prev_time);
+}
+
+/*
+ * Detach an inode from the flush tree, whether it is the tree-resident
+ * node for its dirtied_when key or merely chained behind that node.
+ * The inode must have one of the I_DIRTY flags set.
+ */
+void flush_tree_remove(struct inode *inode)
+{
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
+	struct rb_root *tree = &wb->flush_tree;
+	struct rb_node *self = &inode->i_flush_node;
+
+	BUG_ON(!(inode->i_state & I_DIRTY));
+
+	if (!list_empty(&inode->i_list)) {
+		/*
+		 * Other inodes share this key. If we are the one linked into
+		 * the rb tree, hand our tree position to the next inode on
+		 * the chain before leaving.
+		 */
+		if (!RB_EMPTY_NODE(self)) {
+			struct inode *heir;
+
+			heir = list_entry(inode->i_list.next,
+					  struct inode, i_list);
+			rb_replace_node(self, &heir->i_flush_node, tree);
+			RB_CLEAR_NODE(self);
+		}
+
+		/* Unhook from the chain */
+		list_del_init(&inode->i_list);
+		return;
+	}
+
+	/* Sole owner of the key: it has to be in the tree, so erase it. */
+	BUG_ON(RB_EMPTY_NODE(self));
+	rb_erase(self, tree);
+	RB_CLEAR_NODE(self);
+}
--- /dev/null
+#ifndef WB_FLUSHTREE_H
+#define WB_FLUSHTREE_H
+
+/*
+ * Interface to the per-writeback flush tree of dirty inodes.
+ *
+ * Only pointers to these structures are used below, so forward
+ * declarations keep this header usable regardless of include order.
+ */
+struct inode;
+struct bdi_writeback;
+
+/* Park a dirty inode in the flush tree of its backing device. */
+void flush_tree_insert(struct inode *inode);
+
+/* Take an inode out of the flush tree of its backing device. */
+void flush_tree_remove(struct inode *inode);
+
+/*
+ * Return the next inode to process after the key @prev_time for the pass
+ * identified by @start_time, or NULL when none is left.
+ */
+struct inode *flush_tree_next(struct bdi_writeback *wb,
+			      unsigned long start_time,
+			      unsigned long prev_time);
+
+#endif /* WB_FLUSHTREE_H */
#include "internal.h"
#include <trace/events/writeback.h>
-#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
+#include "flushtree.h"
/*
* We don't actually have pdflush, but this one is exported though /proc...
bdi_alloc_queue_work(bdi, &args);
}
-/*
- * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
- * furthest end of its superblock's dirty-inode list.
- *
- * Before stamping the inode's ->dirtied_when, we check to see whether it is
- * already the most-recently-dirtied inode on the b_dirty list. If that is
- * the case then the inode must have been redirtied while it was being written
- * out and we don't reset its dirtied_when.
- */
-static void redirty_tail(struct inode *inode)
-{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
- if (!list_empty(&wb->b_dirty)) {
- struct inode *tail;
-
- tail = list_entry(wb->b_dirty.next, struct inode, i_list);
- if (time_before(inode->dirtied_when, tail->dirtied_when))
- inode->dirtied_when = jiffies;
- }
- list_move(&inode->i_list, &wb->b_dirty);
-}
-
-/*
- * requeue inode for re-scanning after bdi->b_io list is exhausted.
- */
-static void requeue_io(struct inode *inode)
-{
- struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
- list_move(&inode->i_list, &wb->b_more_io);
-}
-
static void inode_sync_complete(struct inode *inode)
{
/*
return ret;
}
-/*
- * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
- */
-static void move_expired_inodes(struct list_head *delaying_queue,
- struct list_head *dispatch_queue,
- unsigned long *older_than_this)
-{
- LIST_HEAD(tmp);
- struct list_head *pos, *node;
- struct super_block *sb = NULL;
- struct inode *inode;
- int do_sb_sort = 0;
-
- while (!list_empty(delaying_queue)) {
- inode = list_entry(delaying_queue->prev, struct inode, i_list);
- if (older_than_this &&
- inode_dirtied_after(inode, *older_than_this))
- break;
- if (sb && sb != inode->i_sb)
- do_sb_sort = 1;
- sb = inode->i_sb;
- list_move(&inode->i_list, &tmp);
- }
-
- /* just one sb in list, splice to dispatch_queue and we're done */
- if (!do_sb_sort) {
- list_splice(&tmp, dispatch_queue);
- return;
- }
-
- /* Move inodes from one superblock together */
- while (!list_empty(&tmp)) {
- inode = list_entry(tmp.prev, struct inode, i_list);
- sb = inode->i_sb;
- list_for_each_prev_safe(pos, node, &tmp) {
- inode = list_entry(pos, struct inode, i_list);
- if (inode->i_sb == sb)
- list_move(&inode->i_list, dispatch_queue);
- }
- }
-}
-
-/*
- * Queue all expired dirty inodes for io, eldest first.
- */
-static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
-{
- list_splice_init(&wb->b_more_io, wb->b_io.prev);
- move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
-}
-
static int write_inode(struct inode *inode, int sync)
{
if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
* We'll have another go at writing back this inode when we
* completed a full scan of b_io.
*/
- if (!wait) {
- requeue_io(inode);
+ if (!wait)
return 0;
- }
/*
* It's a data-integrity sync. We must wait.
BUG_ON(inode->i_state & I_SYNC);
/* Set I_SYNC, reset I_DIRTY */
+ flush_tree_remove(inode);
dirty = inode->i_state & I_DIRTY;
inode->i_state |= I_SYNC;
inode->i_state &= ~I_DIRTY;
/*
* More pages get dirtied by a fast dirtier.
*/
- goto select_queue;
+ flush_tree_insert(inode);
} else if (inode->i_state & I_DIRTY) {
/*
* At least XFS will redirty the inode during the
* writeback (delalloc) and on io completion (isize).
*/
- redirty_tail(inode);
+ inode->dirtied_when = jiffies;
+ flush_tree_insert(inode);
} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/*
* We didn't write back all the pages. nfs_writepages()
*/
if (wbc->for_kupdate) {
/*
- * For the kupdate function we move the inode
- * to b_more_io so it will get more writeout as
- * soon as the queue becomes uncongested.
+ * For the kupdate function we leave
+ * dirtied_when field untouched and return
+ * it to the flush_tree. The next iteration
+ * of kupdate will flush more pages when
+ * the queue is no longer congested.
*/
inode->i_state |= I_DIRTY_PAGES;
-select_queue:
- if (wbc->nr_to_write <= 0) {
- /*
- * slice used up: queue for next turn
- */
- requeue_io(inode);
- } else {
- /*
- * somehow blocked: retry later
- */
- redirty_tail(inode);
- }
+ flush_tree_insert(inode);
} else {
/*
* Otherwise fully redirty the inode so that
* all the other files.
*/
inode->i_state |= I_DIRTY_PAGES;
- redirty_tail(inode);
+ inode->dirtied_when = jiffies;
+ flush_tree_insert(inode);
}
} else if (atomic_read(&inode->i_count)) {
/*
{
struct super_block *sb = wbc->sb, *pin_sb = NULL;
const unsigned long start = jiffies; /* livelock avoidance */
+ struct inode *inode = NULL;
+ unsigned long prev_time = 0;
spin_lock(&inode_lock);
- if (!wbc->for_kupdate || list_empty(&wb->b_io))
- queue_io(wb, wbc->older_than_this);
-
- while (!list_empty(&wb->b_io)) {
- struct inode *inode = list_entry(wb->b_io.prev,
- struct inode, i_list);
+ while ((inode = flush_tree_next(wb, start, prev_time)) != NULL) {
long pages_skipped;
+ prev_time = inode->dirtied_when;
+ inode->i_flushed_when = start;
+
/*
* super block given and doesn't match, skip this inode
*/
- if (sb && sb != inode->i_sb) {
- redirty_tail(inode);
+ if (sb && sb != inode->i_sb)
continue;
- }
- if (inode->i_state & (I_NEW | I_WILL_FREE)) {
- requeue_io(inode);
+ if (inode->i_state & (I_NEW | I_WILL_FREE))
continue;
- }
/*
* Was this inode dirtied after sync_sb_inodes was called?
if (inode_dirtied_after(inode, start))
break;
+ /* Was this inode dirtied too recently? */
+ if (wbc->older_than_this &&
+ time_after(inode->dirtied_when, *wbc->older_than_this))
+ break;
+
if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
- requeue_io(inode);
+ wbc->more_io = 1;
continue;
}
__iget(inode);
pages_skipped = wbc->pages_skipped;
writeback_single_inode(inode, wbc);
- if (wbc->pages_skipped != pages_skipped) {
- /*
- * writeback is not making progress due to locked
- * buffers. Skip this inode for now.
- */
- redirty_tail(inode);
- }
spin_unlock(&inode_lock);
iput(inode);
cond_resched();
wbc->more_io = 1;
break;
}
- if (!list_empty(&wb->b_more_io))
- wbc->more_io = 1;
}
unpin_sb_for_writeback(&pin_sb);
spin_unlock(&inode_lock);
- /* Leave any unwritten inodes on b_io */
}
void writeback_inodes_wbc(struct writeback_control *wbc)
};
unsigned long oldest_jif;
long wrote = 0;
- struct inode *inode;
if (wbc.for_kupdate) {
wbc.older_than_this = &oldest_jif;
*/
if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
continue;
+#if 0
/*
* Nothing written. Wait for some inode to
* become available for writeback. Otherwise
trace_writeback_inode_wait(0);
}
spin_unlock(&inode_lock);
+#endif
}
return wrote;
if ((inode->i_state & flags) == flags)
return;
+#if 0
+ /* anonymous file systems do not write data back */
+ if (inode->i_sb->s_type->fs_flags & FS_ANONYMOUS)
+ return;
+#endif
+
+ if (inode->i_state & I_DIRTY_NEVER)
+ return;
+
if (unlikely(block_dump))
block_dump___mark_inode_dirty(inode);
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
- inode->i_state |= flags;
-
- /*
- * If the inode is being synced, just update its dirty state.
- * The unlocker will place the inode on the appropriate
- * superblock list, based upon its state.
- */
- if (inode->i_state & I_SYNC)
+ if (inode->i_state & (I_FREEING|I_CLEAR))
goto out;
/*
if (hlist_unhashed(&inode->i_hash))
goto out;
}
- if (inode->i_state & (I_FREEING|I_CLEAR))
+
+ inode->i_state |= flags;
+
+ /*
+ * If the inode is being synced, just update its dirty state.
+ * The unlocker will place the inode on the appropriate
+ * superblock list, based upon its state.
+ */
+ if (inode->i_state & I_SYNC)
goto out;
/*
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_list, &wb->b_dirty);
+ inode->i_flushed_when = inode->dirtied_when;
+ flush_tree_insert(inode);
}
}
out:
#include <linux/mount.h>
#include <linux/async.h>
#include <linux/posix_acl.h>
+#include "flushtree.h"
/*
* This is needed for the following functions:
inode->i_cdev = NULL;
inode->i_rdev = 0;
inode->dirtied_when = 0;
+ RB_CLEAR_NODE(&inode->i_flush_node);
if (security_inode_alloc(inode))
goto out;
{
const struct super_operations *op = inode->i_sb->s_op;
+ if ((inode->i_state & I_DIRTY)) {
+ flush_tree_remove(inode);
+ inode->i_state &= ~I_DIRTY;
+ }
+
list_del_init(&inode->i_list);
list_del_init(&inode->i_sb_list);
WARN_ON(inode->i_state & I_NEW);
inode->i_fop = &rdwr_pipefifo_fops;
/*
- * Mark the inode dirty from the very beginning,
- * that way it will never be moved to the dirty
- * list because "mark_inode_dirty()" will think
- * that it already _is_ on the dirty list.
+ * Mark the inode "never dirty" from the very beginning,
+ * that way it will never be written back.
*/
- inode->i_state = I_DIRTY;
+ inode->i_state = I_DIRTY_NEVER;
inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
unsigned long last_old_flush; /* last old data flush */
struct task_struct *task; /* writeback task */
- struct list_head b_dirty; /* dirty inodes */
- struct list_head b_io; /* parked for writeback */
- struct list_head b_more_io; /* parked for more writeback */
+
+ struct rb_root flush_tree;
};
struct backing_dev_info {
void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
long nr_pages);
int bdi_writeback_task(struct bdi_writeback *wb);
-int bdi_has_dirty_io(struct backing_dev_info *bdi);
+bool bdi_has_dirty_io(struct backing_dev_info *bdi);
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
-static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
- return !list_empty(&wb->b_dirty) ||
- !list_empty(&wb->b_io) ||
- !list_empty(&wb->b_more_io);
+ return !RB_EMPTY_ROOT(&wb->flush_tree);
}
static inline void __add_bdi_stat(struct backing_dev_info *bdi,
#include <linux/limits.h>
#include <linux/ioctl.h>
+#include <linux/rbtree.h>
/*
* It's silly to have NR_OPEN bigger than NR_FILE, but you can change
struct hlist_node i_hash;
struct list_head i_list; /* backing dev IO list */
struct list_head i_sb_list;
+ struct rb_node i_flush_node;
+ unsigned long i_flushed_when;
struct list_head i_dentry;
unsigned long i_ino;
atomic_t i_count;
#define I_CLEAR 64
#define __I_SYNC 7
#define I_SYNC (1 << __I_SYNC)
+#define I_DIRTY_NEVER (1 << 9)
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
void wakeup_flusher_threads(long nr_pages);
+#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
+
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
{
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
- struct inode *inode;
+ unsigned long nr_dirty, nr_wb;
/*
* inode lock is enough here, the bdi->wb_list is protected by
* RCU on the reader side
*/
- nr_wb = nr_dirty = nr_io = nr_more_io = 0;
+ nr_wb = nr_dirty = 0;
spin_lock(&inode_lock);
list_for_each_entry(wb, &bdi->wb_list, list) {
+ struct rb_node *n;
+
nr_wb++;
- list_for_each_entry(inode, &wb->b_dirty, i_list)
+ n = rb_first(&wb->flush_tree);
+ while (n) {
nr_dirty++;
- list_for_each_entry(inode, &wb->b_io, i_list)
- nr_io++;
- list_for_each_entry(inode, &wb->b_more_io, i_list)
- nr_more_io++;
+ n = rb_next(n);
+ }
}
spin_unlock(&inode_lock);
"BackgroundThresh: %8lu kB\n"
"WritebackThreads: %8lu\n"
"b_dirty: %8lu\n"
- "b_io: %8lu\n"
- "b_more_io: %8lu\n"
"bdi_list: %8u\n"
"state: %8lx\n"
"wb_mask: %8lx\n"
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
K(bdi_thresh), K(dirty_thresh),
- K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
+ K(background_thresh), nr_wb, nr_dirty,
!list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
!list_empty(&bdi->wb_list), bdi->wb_cnt);
#undef K
wb->bdi = bdi;
wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
+ wb->flush_tree = RB_ROOT;
}
static void bdi_task_init(struct backing_dev_info *bdi,
return ret;
}
-int bdi_has_dirty_io(struct backing_dev_info *bdi)
+bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
return wb_has_dirty_io(&bdi->wb);
}
{
int i;
- /*
- * Splice our entries to the default_backing_dev_info, if this
- * bdi disappears
- */
- if (bdi_has_dirty_io(bdi)) {
- struct bdi_writeback *dst = &default_backing_dev_info.wb;
-
- spin_lock(&inode_lock);
- list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
- list_splice(&bdi->wb.b_io, &dst->b_io);
- list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_lock);
- }
+ BUG_ON(bdi_has_dirty_io(bdi));
bdi_unregister(bdi);