[linux-2.6-block.git] / fs / ocfs2 / uptodate.c

/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * uptodate.c
 *
 * Tracking the up-to-date-ness of a local buffer_head with respect to
 * the cluster.
 *
 * Copyright (C) 2002, 2004, 2005 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Standard buffer head caching flags (uptodate, etc) are insufficient
 * in a clustered environment - a buffer may be marked up to date on
 * our local node but could have been modified by another cluster
 * member. As a result an additional (and performant) caching scheme
 * is required. A further requirement is that we consume as little
 * memory as possible - we never pin buffer_head structures in order
 * to cache them.
 *
 * We track the existence of up to date buffers on the inodes which
 * are associated with them. Because we don't want to pin
 * buffer_heads, this is only a (strong) hint and several other checks
 * are made in the I/O path to ensure that we don't use a stale or
 * invalid buffer without going to disk:
 *	- buffer_jbd is used liberally - if a bh is in the journal on
 *	  this node then it *must* be up to date.
 *	- the standard buffer_uptodate() macro is used to detect buffers
 *	  which may be invalid (even if we have an up to date tracking
 * 	  item for them)
 *
 * For a full understanding of how this code works together, one
 * should read the callers in dlmglue.c, the I/O functions in
 * buffer_head_io.c and ocfs2_journal_access in journal.c
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
#ifndef CONFIG_OCFS2_COMPAT_JBD
# include <linux/jbd2.h>
#else
# include <linux/jbd.h>
#endif

#define MLOG_MASK_PREFIX ML_UPTODATE

#include <cluster/masklog.h>

#include "ocfs2.h"

#include "inode.h"
#include "uptodate.h"

/* One tracked block in the tree form of an inode's metadata cache.
 * Items are allocated from ocfs2_uptodate_cachep. */
struct ocfs2_meta_cache_item {
	struct rb_node	c_node;		/* linkage into ci_cache.ci_tree */
	sector_t	c_block;	/* block number; the tree's sort key */
};

/* Slab cache backing every ocfs2_meta_cache_item. Created by
 * init_ocfs2_uptodate_cache() and destroyed by
 * exit_ocfs2_uptodate_cache(); static storage starts out NULL. */
static struct kmem_cache *ocfs2_uptodate_cachep;

/* Put an inode's metadata cache into the empty, inline-array state.
 * No lock is taken here; ocfs2_metadata_cache_purge() calls this with
 * ip_lock already held. */
void ocfs2_metadata_cache_init(struct inode *inode)
{
	struct ocfs2_inode_info *info = OCFS2_I(inode);

	info->ip_metadata_cache.ci_num_cached = 0;
	info->ip_flags |= OCFS2_INODE_CACHE_INLINE;
}

/* Erase and free every item reachable from 'root', returning how many
 * were freed. No lock is taken here as 'root' is not expected to be
 * visible to other processes. */
static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
{
	unsigned int count;
	struct rb_node *last;
	struct ocfs2_meta_cache_item *victim;

	for (count = 0; (last = rb_last(root)) != NULL; count++) {
		victim = rb_entry(last, struct ocfs2_meta_cache_item, c_node);

		mlog(0, "Purge item %llu\n",
		     (unsigned long long) victim->c_block);

		rb_erase(&victim->c_node, root);
		kmem_cache_free(ocfs2_uptodate_cachep, victim);
	}

	return count;
}

/* Called from locking and called from ocfs2_clear_inode. Dump the
 * cache for a given inode.
 *
 * The cache is reset to an empty inline array under ip_lock. If it was
 * a tree, the root is copied out first so that the items can be freed
 * after the spinlock has been dropped.
 *
 * This function is a few more lines longer than necessary due to some
 * accounting done here, but I think it's worth tracking down those
 * bugs sooner -- Mark */
void ocfs2_metadata_cache_purge(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	unsigned int tree, to_purge, purged;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct rb_root root = RB_ROOT;

	spin_lock(&oi->ip_lock);
	/* 'tree' is nonzero when the cache is NOT in inline-array form. */
	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
	to_purge = ci->ci_num_cached;

	mlog(0, "Purge %u %s items from Inode %llu\n", to_purge,
	     tree ? "tree" : "array", (unsigned long long)oi->ip_blkno);

	/* If we're a tree, save off the root so that we can safely
	 * initialize the cache. We do the work to free tree members
	 * without the spinlock. */
	if (tree)
		root = ci->ci_cache.ci_tree;

	ocfs2_metadata_cache_init(inode);
	spin_unlock(&oi->ip_lock);

	/* For the array case 'root' is still RB_ROOT, so nothing is
	 * freed and 'purged' stays 0. */
	purged = ocfs2_purge_copied_metadata_tree(&root);
	/* If possible, track the number wiped so that we can more
	 * easily detect counting errors. Unfortunately, this is only
	 * meaningful for trees. */
	if (tree && purged != to_purge)
		mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n",
		     (unsigned long long)oi->ip_blkno, to_purge, purged);
}

/* Linear scan of the inline array for 'item'. Returns its index, or
 * -1 if it is not present.
 * Requires ip_lock. */
static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
				    sector_t item)
{
	int pos = 0;

	while (pos < ci->ci_num_cached) {
		if (ci->ci_cache.ci_array[pos] == item)
			return pos;
		pos++;
	}

	return -1;
}

/* Walk the rb-tree, keyed by c_block, looking for 'block'. Returns
 * the matching item, or NULL if there is none.
 * Requires ip_lock. */
static struct ocfs2_meta_cache_item *
ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
			sector_t block)
{
	struct rb_node *cur;
	struct ocfs2_meta_cache_item *entry;

	for (cur = ci->ci_cache.ci_tree.rb_node; cur != NULL; ) {
		entry = rb_entry(cur, struct ocfs2_meta_cache_item, c_node);

		if (block == entry->c_block)
			return entry;

		/* Smaller keys live to the left, larger to the right. */
		cur = (block < entry->c_block) ? cur->rb_left : cur->rb_right;
	}

	return NULL;
}

/* Report whether bh's block number is recorded in oi's metadata
 * cache, searching the array or the tree depending on the cache's
 * current form. Takes and drops ip_lock around the lookup.
 * Returns 1 if the block is tracked, 0 otherwise. */
static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
			       struct buffer_head *bh)
{
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct ocfs2_meta_cache_item *found = NULL;
	int slot = -1;

	spin_lock(&oi->ip_lock);

	mlog(0, "Inode %llu, query block %llu (inline = %u)\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long) bh->b_blocknr,
	     !!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));

	if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE))
		found = ocfs2_search_cache_tree(ci, bh->b_blocknr);
	else
		slot = ocfs2_search_cache_array(ci, bh->b_blocknr);

	spin_unlock(&oi->ip_lock);

	mlog(0, "index = %d, item = %p\n", slot, found);

	return (found != NULL) || (slot != -1);
}

/* Decide whether the contents of bh can be trusted without going to
 * disk.
 *
 * Warning: even if it returns true, this does *not* guarantee that
 * the block is stored in our inode metadata cache (a journalled
 * buffer is accepted without consulting the cache).
 *
 * This can be called under lock_buffer()
 */
int ocfs2_buffer_uptodate(struct inode *inode,
			  struct buffer_head *bh)
{
	if (buffer_uptodate(bh)) {
		/* OCFS2 does not allow multiple nodes to be changing
		 * the same block at the same time. */
		if (buffer_jbd(bh))
			return 1;

		/* Locally the buffer is marked as up to date; our
		 * cache tells us whether that can be trusted. */
		return ocfs2_buffer_cached(OCFS2_I(inode), bh);
	}

	/* Not marked uptodate locally, so it cannot have correct data
	 * regardless of what our cache says. */
	return 0;
}

/*
 * Determine whether a buffer is currently out on a read-ahead request:
 * it must be locked and already tracked in the inode's metadata cache.
 * ip_io_mutex should be held to serialize submitters with the logic
 * here.
 */
int ocfs2_buffer_read_ahead(struct inode *inode,
			    struct buffer_head *bh)
{
	if (!buffer_locked(bh))
		return 0;

	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
}

/* Store 'block' in the next free slot of the inline array. The caller
 * must already know there is room; a full array is a BUG.
 * Requires ip_lock */
static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
				     sector_t block)
{
	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);

	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
	     ci->ci_num_cached);

	ci->ci_cache.ci_array[ci->ci_num_cached++] = block;
}

/* By now the caller should have checked that the item does *not*
 * exist in the tree.
 * Requires ip_lock. */
static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
				      struct ocfs2_meta_cache_item *new)
{
	sector_t block = new->c_block;
	struct rb_node *parent = NULL;
	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
	struct ocfs2_meta_cache_item *tmp;

	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
	     ci->ci_num_cached);

	while(*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);

		if (block < tmp->c_block)
			p = &(*p)->rb_left;
		else if (block > tmp->c_block)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate block %llu cached!\n",
			     (unsigned long long) block);
			BUG();
		}
	}

	rb_link_node(&new->c_node, parent, p);
	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
	ci->ci_num_cached++;
}

/* Returns 1 when the cache is still in inline-array form and has a
 * free slot, 0 otherwise. Asserts that ip_lock is held. */
static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
					     struct ocfs2_caching_info *ci)
{
	assert_spin_locked(&oi->ip_lock);

	if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE))
		return 0;

	return ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY;
}

/* Convert a full inline-array cache into an rb-tree cache.
 *
 * 'tree' holds preallocated, uninitialized items, one per array slot.
 * tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
 * pointers in tree after we use them - this allows caller to detect
 * when to free in case of error.
 *
 * The array must be completely full and the inode still flagged as
 * inline; both conditions are enforced with mlog_bug_on_msg().
 * Requires ip_lock (asserted). */
static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
			       struct ocfs2_meta_cache_item **tree)
{
	int i;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;

	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
			"Inode %llu, num cached = %u, should be %u\n",
			(unsigned long long)oi->ip_blkno, ci->ci_num_cached,
			OCFS2_INODE_MAX_CACHE_ARRAY);
	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
			"Inode %llu not marked as inline anymore!\n",
			(unsigned long long)oi->ip_blkno);
	assert_spin_locked(&oi->ip_lock);

	/* Be careful to initialize the tree members *first* because
	 * once the ci_tree is used, the array is junk... */
	/* NOTE(review): this implies ci_array and ci_tree share storage
	 * (presumably a union in ocfs2_caching_info) - confirm. */
	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
		tree[i]->c_block = ci->ci_cache.ci_array[i];

	/* Switch representation: clear the inline flag and start from
	 * an empty tree. */
	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
	ci->ci_cache.ci_tree = RB_ROOT;
	/* this will be set again by __ocfs2_insert_cache_tree */
	ci->ci_num_cached = 0;

	/* Hand each item over to the tree and drop our reference so the
	 * caller will not free it. */
	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
		__ocfs2_insert_cache_tree(ci, tree[i]);
		tree[i] = NULL;
	}

	mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
	     (unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
}

/* Slow path function - memory allocation is necessary. See the
 * comment above ocfs2_set_buffer_uptodate for more information.
 *
 * Records 'block' in oi's metadata cache as a tree item. When
 * 'expand_tree' is nonzero, one extra item per inline-array slot is
 * allocated up front so the array can be converted to a tree while
 * ip_lock is held. All allocations happen before the lock is taken.
 *
 * Nothing is returned: on allocation failure the error is logged and
 * the block is simply not cached. Any item that did not end up in the
 * tree is freed before returning. */
static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
					sector_t block,
					int expand_tree)
{
	int i;
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct ocfs2_meta_cache_item *new = NULL;
	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
		{ NULL, };

	mlog(0, "Inode %llu, block %llu, expand = %d\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long)block, expand_tree);

	/* Item for the block being inserted. */
	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
	if (!new) {
		mlog_errno(-ENOMEM);
		return;
	}
	new->c_block = block;

	if (expand_tree) {
		/* Do *not* allocate an array here - the removal code
		 * has no way of tracking that. */
		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
			tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
						   GFP_NOFS);
			if (!tree[i]) {
				/* Partial allocation: out_free releases
				 * 'new' and whatever tree[] entries we
				 * did get. */
				mlog_errno(-ENOMEM);
				goto out_free;
			}

			/* These are initialized in ocfs2_expand_cache! */
		}
	}

	spin_lock(&oi->ip_lock);
	if (ocfs2_insert_can_use_array(oi, ci)) {
		mlog(0, "Someone cleared the tree underneath us\n");
		/* Ok, items were removed from the cache in between
		 * locks. Detect this and revert back to the fast path */
		ocfs2_append_cache_array(ci, block);
		spin_unlock(&oi->ip_lock);
		/* None of the preallocated items were needed. */
		goto out_free;
	}

	if (expand_tree)
		ocfs2_expand_cache(oi, tree);

	__ocfs2_insert_cache_tree(ci, new);
	spin_unlock(&oi->ip_lock);

	/* 'new' now belongs to the tree; keep out_free from freeing it. */
	new = NULL;
out_free:
	if (new)
		kmem_cache_free(ocfs2_uptodate_cachep, new);

	/* If these were used, then ocfs2_expand_cache re-set them to
	 * NULL for us. */
	/* tree[] is filled in order from index 0, so a NULL tree[0]
	 * means there is nothing left to free. */
	if (tree[0]) {
		for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
			if (tree[i])
				kmem_cache_free(ocfs2_uptodate_cachep,
						tree[i]);
	}
}

/* Record bh's block number in the inode's metadata cache.
 *
 * Item insertion is guarded by ip_io_mutex, so the insertion path
 * takes advantage of this by not rechecking for a duplicate insert
 * during the slow case. Additionally, if the cache needs to be bumped
 * up to a tree, the code will not recheck after acquiring the lock --
 * multiple paths cannot be expanding to a tree at the same time.
 *
 * The slow path takes into account that items can be removed
 * (including the whole tree wiped and reset) while this process is
 * out allocating memory. In those cases, it reverts back to the fast
 * path.
 *
 * Note that this function may actually fail to insert the block if
 * memory cannot be allocated. This is not fatal however (but may
 * result in a performance penalty)
 *
 * Readahead buffers can be passed in here before the I/O request is
 * completed.
 */
void ocfs2_set_buffer_uptodate(struct inode *inode,
			       struct buffer_head *bh)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	int need_tree;

	/* Already tracked? Then there is nothing more to do. */
	if (ocfs2_buffer_cached(oi, bh))
		return;

	mlog(0, "Inode %llu, inserting block %llu\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long)bh->b_blocknr);

	/* No need to recheck under spinlock - insertion is guarded by
	 * ip_io_mutex */
	spin_lock(&oi->ip_lock);
	if (ocfs2_insert_can_use_array(oi, ci)) {
		/* Fast case - it's an array and there's a free
		 * spot. */
		ocfs2_append_cache_array(ci, bh->b_blocknr);
		spin_unlock(&oi->ip_lock);
		return;
	}

	/* Still inline but full: the slow path has to bump things up
	 * to a tree. Otherwise we are already a tree. */
	need_tree = (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) ? 1 : 0;
	spin_unlock(&oi->ip_lock);

	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, need_tree);
}

/* Called against a newly allocated buffer: marks bh uptodate and then
 * records it in the inode's metadata cache. Most likely nobody should
 * be able to read this sort of metadata while it's still being
 * allocated, but this is careful to take ip_io_mutex anyway. */
void ocfs2_set_new_buffer_uptodate(struct inode *inode,
				   struct buffer_head *bh)
{
	struct ocfs2_inode_info *info = OCFS2_I(inode);

	/* A brand new block should definitely *not* be in our cache
	 * already. */
	BUG_ON(ocfs2_buffer_cached(info, bh));

	set_buffer_uptodate(bh);

	mutex_lock(&info->ip_io_mutex);
	ocfs2_set_buffer_uptodate(inode, bh);
	mutex_unlock(&info->ip_io_mutex);
}

/* Requires ip_lock. */
static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
					int index)
{
	sector_t *array = ci->ci_cache.ci_array;
	int bytes;

	BUG_ON(index < 0 || index >= OCFS2_INODE_MAX_CACHE_ARRAY);
	BUG_ON(index >= ci->ci_num_cached);
	BUG_ON(!ci->ci_num_cached);

	mlog(0, "remove index %d (num_cached = %u\n", index,
	     ci->ci_num_cached);

	ci->ci_num_cached--;

	/* don't need to copy if the array is now empty, or if we
	 * removed at the tail */
	if (ci->ci_num_cached && index < ci->ci_num_cached) {
		bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
		memmove(&array[index], &array[index + 1], bytes);
	}
}

/* Unlink 'item' from the rb-tree and drop the cached count. The item
 * itself is not freed here.
 * Requires ip_lock. */
static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
				       struct ocfs2_meta_cache_item *item)
{
	struct rb_root *root = &ci->ci_cache.ci_tree;

	mlog(0, "remove block %llu from tree\n",
	     (unsigned long long) item->c_block);

	ci->ci_num_cached--;
	rb_erase(&item->c_node, root);
}

/* Forget 'block' in the inode's metadata cache, if it is tracked.
 * The lookup and unlink happen under ip_lock; a tree item is freed
 * only after the lock has been released. */
static void ocfs2_remove_block_from_cache(struct inode *inode,
					  sector_t block)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	struct ocfs2_meta_cache_item *victim = NULL;
	int slot;

	spin_lock(&oi->ip_lock);
	mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n",
	     (unsigned long long)oi->ip_blkno,
	     (unsigned long long) block, ci->ci_num_cached,
	     oi->ip_flags & OCFS2_INODE_CACHE_INLINE);

	if (!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE)) {
		victim = ocfs2_search_cache_tree(ci, block);
		if (victim != NULL)
			ocfs2_remove_metadata_tree(ci, victim);
	} else {
		slot = ocfs2_search_cache_array(ci, block);
		if (slot != -1)
			ocfs2_remove_metadata_array(ci, slot);
	}
	spin_unlock(&oi->ip_lock);

	if (victim != NULL)
		kmem_cache_free(ocfs2_uptodate_cachep, victim);
}

/*
 * Called when we remove a chunk of metadata from an inode: drops bh's
 * block number from the inode's cache. We don't bother reverting
 * things to an inlined array in the case of a remove which moves us
 * back under the limit.
 */
void ocfs2_remove_from_cache(struct inode *inode,
			     struct buffer_head *bh)
{
	ocfs2_remove_block_from_cache(inode, bh->b_blocknr);
}

/* Called when we remove xattr clusters from an inode. Drops every
 * block of the 'c_len' clusters starting at 'block' from the cache,
 * one block at a time. */
void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
					    sector_t block,
					    u32 c_len)
{
	/* Blocks per cluster times the number of clusters. */
	u64 remaining = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;

	for (; remaining > 0; remaining--, block++)
		ocfs2_remove_block_from_cache(inode, block);
}

/* Create the slab cache used for tree-form cache items.
 * Returns 0 on success or -ENOMEM if the cache cannot be created. */
int __init init_ocfs2_uptodate_cache(void)
{
	struct kmem_cache *cache;

	cache = kmem_cache_create("ocfs2_uptodate",
				  sizeof(struct ocfs2_meta_cache_item),
				  0, SLAB_HWCACHE_ALIGN, NULL);
	ocfs2_uptodate_cachep = cache;
	if (cache == NULL)
		return -ENOMEM;

	mlog(0, "%u inlined cache items per inode.\n",
	     OCFS2_INODE_MAX_CACHE_ARRAY);

	return 0;
}

/* Destroy the item slab cache, if it was ever created. */
void exit_ocfs2_uptodate_cache(void)
{
	if (!ocfs2_uptodate_cachep)
		return;

	kmem_cache_destroy(ocfs2_uptodate_cachep);
}
Commit	Line	Data
ccd979bd MF	1	/* -- mode: c; c-basic-offset: 8; --
	2	* vim: noexpandtab sw=8 ts=8 sts=0:
	3	*
	4	* uptodate.c
	5	*
	6	* Tracking the up-to-date-ness of a local buffer_head with respect to
	7	* the cluster.
	8	*
	9	* Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved.
	10	*
	11	* This program is free software; you can redistribute it and/or
	12	* modify it under the terms of the GNU General Public
	13	* License as published by the Free Software Foundation; either
	14	* version 2 of the License, or (at your option) any later version.
	15	*
	16	* This program is distributed in the hope that it will be useful,
	17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	19	* General Public License for more details.
	20	*
	21	* You should have received a copy of the GNU General Public
	22	* License along with this program; if not, write to the
	23	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	24	* Boston, MA 021110-1307, USA.
	25	*
	26	* Standard buffer head caching flags (uptodate, etc) are insufficient
	27	* in a clustered environment - a buffer may be marked up to date on
	28	* our local node but could have been modified by another cluster
	29	* member. As a result an additional (and performant) caching scheme
	30	* is required. A further requirement is that we consume as little
	31	* memory as possible - we never pin buffer_head structures in order
	32	* to cache them.
	33	*
	34	* We track the existence of up to date buffers on the inodes which
	35	* are associated with them. Because we don't want to pin
	36	* buffer_heads, this is only a (strong) hint and several other checks
	37	* are made in the I/O path to ensure that we don't use a stale or
	38	* invalid buffer without going to disk:
	39	* - buffer_jbd is used liberally - if a bh is in the journal on
	40	* this node then it must be up to date.
	41	* - the standard buffer_uptodate() macro is used to detect buffers
	42	* which may be invalid (even if we have an up to date tracking
	43	* item for them)
	44	*
	45	* For a full understanding of how this code works together, one
	46	* should read the callers in dlmglue.c, the I/O functions in
	47	* buffer_head_io.c and ocfs2_journal_access in journal.c
	48	*/
	49
	50	#include <linux/fs.h>
	51	#include <linux/types.h>
	52	#include <linux/slab.h>
	53	#include <linux/highmem.h>
	54	#include <linux/buffer_head.h>
	55	#include <linux/rbtree.h>
2b4e30fb JB	56	#ifndef CONFIG_OCFS2_COMPAT_JBD
	57	# include <linux/jbd2.h>
	58	#else
	59	# include <linux/jbd.h>
	60	#endif
ccd979bd MF	61
	62	#define MLOG_MASK_PREFIX ML_UPTODATE
	63
	64	#include <cluster/masklog.h>
	65
	66	#include "ocfs2.h"
	67
	68	#include "inode.h"
	69	#include "uptodate.h"
	70
	71	struct ocfs2_meta_cache_item {
	72	struct rb_node c_node;
	73	sector_t c_block;
	74	};
	75
e18b890b	76	static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
ccd979bd MF	77
	78	void ocfs2_metadata_cache_init(struct inode *inode)
	79	{
	80	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	81	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	82
	83	oi->ip_flags \|= OCFS2_INODE_CACHE_INLINE;
	84	ci->ci_num_cached = 0;
	85	}
	86
	87	/* No lock taken here as 'root' is not expected to be visible to other
	88	* processes. */
	89	static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
	90	{
	91	unsigned int purged = 0;
	92	struct rb_node *node;
	93	struct ocfs2_meta_cache_item *item;
	94
	95	while ((node = rb_last(root)) != NULL) {
	96	item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
	97
	98	mlog(0, "Purge item %llu\n",
	99	(unsigned long long) item->c_block);
	100
	101	rb_erase(&item->c_node, root);
	102	kmem_cache_free(ocfs2_uptodate_cachep, item);
	103
	104	purged++;
	105	}
	106	return purged;
	107	}
	108
	109	/* Called from locking and called from ocfs2_clear_inode. Dump the
	110	* cache for a given inode.
	111	*
	112	* This function is a few more lines longer than necessary due to some
	113	* accounting done here, but I think it's worth tracking down those
	114	* bugs sooner -- Mark */
	115	void ocfs2_metadata_cache_purge(struct inode *inode)
	116	{
	117	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	118	unsigned int tree, to_purge, purged;
	119	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	120	struct rb_root root = RB_ROOT;
	121
	122	spin_lock(&oi->ip_lock);
	123	tree = !(oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
	124	to_purge = ci->ci_num_cached;
	125
b0697053 MF	126	mlog(0, "Purge %u %s items from Inode %llu\n", to_purge,
b0697053 MF	127	tree ? "array" : "tree", (unsigned long long)oi->ip_blkno);
ccd979bd MF	128
	129	/* If we're a tree, save off the root so that we can safely
	130	* initialize the cache. We do the work to free tree members
	131	* without the spinlock. */
	132	if (tree)
	133	root = ci->ci_cache.ci_tree;
	134
	135	ocfs2_metadata_cache_init(inode);
	136	spin_unlock(&oi->ip_lock);
	137
	138	purged = ocfs2_purge_copied_metadata_tree(&root);
	139	/* If possible, track the number wiped so that we can more
	140	* easily detect counting errors. Unfortunately, this is only
	141	* meaningful for trees. */
	142	if (tree && purged != to_purge)
b0697053 MF	143	mlog(ML_ERROR, "Inode %llu, count = %u, purged = %u\n",
b0697053 MF	144	(unsigned long long)oi->ip_blkno, to_purge, purged);
ccd979bd MF	145	}
	146
	147	/* Returns the index in the cache array, -1 if not found.
	148	* Requires ip_lock. */
	149	static int ocfs2_search_cache_array(struct ocfs2_caching_info *ci,
	150	sector_t item)
	151	{
	152	int i;
	153
	154	for (i = 0; i < ci->ci_num_cached; i++) {
	155	if (item == ci->ci_cache.ci_array[i])
	156	return i;
	157	}
	158
	159	return -1;
	160	}
	161
	162	/* Returns the cache item if found, otherwise NULL.
	163	* Requires ip_lock. */
	164	static struct ocfs2_meta_cache_item *
	165	ocfs2_search_cache_tree(struct ocfs2_caching_info *ci,
	166	sector_t block)
	167	{
	168	struct rb_node * n = ci->ci_cache.ci_tree.rb_node;
	169	struct ocfs2_meta_cache_item *item = NULL;
	170
	171	while (n) {
	172	item = rb_entry(n, struct ocfs2_meta_cache_item, c_node);
	173
	174	if (block < item->c_block)
	175	n = n->rb_left;
	176	else if (block > item->c_block)
	177	n = n->rb_right;
	178	else
	179	return item;
	180	}
	181
	182	return NULL;
	183	}
	184
	185	static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
	186	struct buffer_head *bh)
	187	{
	188	int index = -1;
	189	struct ocfs2_meta_cache_item *item = NULL;
	190
	191	spin_lock(&oi->ip_lock);
	192
b0697053 MF	193	mlog(0, "Inode %llu, query block %llu (inline = %u)\n",
	194	(unsigned long long)oi->ip_blkno,
	195	(unsigned long long) bh->b_blocknr,
ccd979bd MF	196	!!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE));
	197
	198	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE)
	199	index = ocfs2_search_cache_array(&oi->ip_metadata_cache,
	200	bh->b_blocknr);
	201	else
	202	item = ocfs2_search_cache_tree(&oi->ip_metadata_cache,
	203	bh->b_blocknr);
	204
	205	spin_unlock(&oi->ip_lock);
	206
	207	mlog(0, "index = %d, item = %p\n", index, item);
	208
	209	return (index != -1) \|\| (item != NULL);
	210	}
	211
	212	/* Warning: even if it returns true, this does not guarantee that
aa958874 MF	213	* the block is stored in our inode metadata cache.
	214	*
	215	* This can be called under lock_buffer()
	216	*/
ccd979bd MF	217	int ocfs2_buffer_uptodate(struct inode *inode,
	218	struct buffer_head *bh)
	219	{
	220	/* Doesn't matter if the bh is in our cache or not -- if it's
	221	* not marked uptodate then we know it can't have correct
	222	* data. */
	223	if (!buffer_uptodate(bh))
	224	return 0;
	225
	226	/* OCFS2 does not allow multiple nodes to be changing the same
	227	* block at the same time. */
	228	if (buffer_jbd(bh))
	229	return 1;
	230
	231	/* Ok, locally the buffer is marked as up to date, now search
	232	* our cache to see if we can trust that. */
	233	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
	234	}
	235
aa958874 MF	236	/*
	237	* Determine whether a buffer is currently out on a read-ahead request.
	238	* ip_io_sem should be held to serialize submitters with the logic here.
	239	*/
	240	int ocfs2_buffer_read_ahead(struct inode *inode,
	241	struct buffer_head *bh)
	242	{
	243	return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
	244	}
	245
ccd979bd MF	246	/* Requires ip_lock */
	247	static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
	248	sector_t block)
	249	{
	250	BUG_ON(ci->ci_num_cached >= OCFS2_INODE_MAX_CACHE_ARRAY);
	251
	252	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
	253	ci->ci_num_cached);
	254
	255	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
	256	ci->ci_num_cached++;
	257	}
	258
	259	/* By now the caller should have checked that the item does not
	260	* exist in the tree.
	261	* Requires ip_lock. */
	262	static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
	263	struct ocfs2_meta_cache_item *new)
	264	{
	265	sector_t block = new->c_block;
	266	struct rb_node *parent = NULL;
	267	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
	268	struct ocfs2_meta_cache_item *tmp;
	269
	270	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
	271	ci->ci_num_cached);
	272
	273	while(*p) {
	274	parent = *p;
	275
	276	tmp = rb_entry(parent, struct ocfs2_meta_cache_item, c_node);
	277
	278	if (block < tmp->c_block)
	279	p = &(*p)->rb_left;
	280	else if (block > tmp->c_block)
	281	p = &(*p)->rb_right;
	282	else {
	283	/* This should never happen! */
	284	mlog(ML_ERROR, "Duplicate block %llu cached!\n",
	285	(unsigned long long) block);
	286	BUG();
	287	}
	288	}
	289
	290	rb_link_node(&new->c_node, parent, p);
	291	rb_insert_color(&new->c_node, &ci->ci_cache.ci_tree);
	292	ci->ci_num_cached++;
	293	}
	294
	295	static inline int ocfs2_insert_can_use_array(struct ocfs2_inode_info *oi,
	296	struct ocfs2_caching_info *ci)
	297	{
	298	assert_spin_locked(&oi->ip_lock);
	299
	300	return (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) &&
	301	(ci->ci_num_cached < OCFS2_INODE_MAX_CACHE_ARRAY);
	302	}
	303
	304	/* tree should be exactly OCFS2_INODE_MAX_CACHE_ARRAY wide. NULL the
	305	* pointers in tree after we use them - this allows caller to detect
	306	* when to free in case of error. */
	307	static void ocfs2_expand_cache(struct ocfs2_inode_info *oi,
	308	struct ocfs2_meta_cache_item **tree)
	309	{
310	int i;
311	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
312
313	mlog_bug_on_msg(ci->ci_num_cached != OCFS2_INODE_MAX_CACHE_ARRAY,
b0697053 MF	314	"Inode %llu, num cached = %u, should be %u\n",
b0697053 MF	315	(unsigned long long)oi->ip_blkno, ci->ci_num_cached,
ccd979bd MF	316	OCFS2_INODE_MAX_CACHE_ARRAY);
ccd979bd MF	317	mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE),
b0697053 MF	318	"Inode %llu not marked as inline anymore!\n",
b0697053 MF	319	(unsigned long long)oi->ip_blkno);
ccd979bd MF	320	assert_spin_locked(&oi->ip_lock);
	321
	322	/* Be careful to initialize the tree members first because
	323	* once the ci_tree is used, the array is junk... */
	324	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
	325	tree[i]->c_block = ci->ci_cache.ci_array[i];
	326
	327	oi->ip_flags &= ~OCFS2_INODE_CACHE_INLINE;
	328	ci->ci_cache.ci_tree = RB_ROOT;
	329	/* this will be set again by __ocfs2_insert_cache_tree */
	330	ci->ci_num_cached = 0;
	331
	332	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
	333	__ocfs2_insert_cache_tree(ci, tree[i]);
	334	tree[i] = NULL;
	335	}
	336
b0697053 MF	337	mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
b0697053 MF	338	(unsigned long long)oi->ip_blkno, oi->ip_flags, ci->ci_num_cached);
ccd979bd MF	339	}
	340
	341	/* Slow path function - memory allocation is necessary. See the
	342	* comment above ocfs2_set_buffer_uptodate for more information. */
	343	static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi,
	344	sector_t block,
	345	int expand_tree)
	346	{
	347	int i;
	348	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	349	struct ocfs2_meta_cache_item *new = NULL;
	350	struct ocfs2_meta_cache_item *tree[OCFS2_INODE_MAX_CACHE_ARRAY] =
	351	{ NULL, };
	352
b0697053 MF	353	mlog(0, "Inode %llu, block %llu, expand = %d\n",
	354	(unsigned long long)oi->ip_blkno,
	355	(unsigned long long)block, expand_tree);
ccd979bd	356
afae00ab	357	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
ccd979bd MF	358	if (!new) {
	359	mlog_errno(-ENOMEM);
	360	return;
	361	}
	362	new->c_block = block;
	363
	364	if (expand_tree) {
	365	/* Do not allocate an array here - the removal code
	366	* has no way of tracking that. */
	367	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) {
	368	tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep,
afae00ab	369	GFP_NOFS);
ccd979bd MF	370	if (!tree[i]) {
	371	mlog_errno(-ENOMEM);
	372	goto out_free;
	373	}
	374
	375	/* These are initialized in ocfs2_expand_cache! */
	376	}
	377	}
	378
	379	spin_lock(&oi->ip_lock);
	380	if (ocfs2_insert_can_use_array(oi, ci)) {
	381	mlog(0, "Someone cleared the tree underneath us\n");
	382	/* Ok, items were removed from the cache in between
	383	* locks. Detect this and revert back to the fast path */
	384	ocfs2_append_cache_array(ci, block);
	385	spin_unlock(&oi->ip_lock);
	386	goto out_free;
	387	}
	388
	389	if (expand_tree)
	390	ocfs2_expand_cache(oi, tree);
	391
	392	__ocfs2_insert_cache_tree(ci, new);
	393	spin_unlock(&oi->ip_lock);
	394
	395	new = NULL;
	396	out_free:
	397	if (new)
	398	kmem_cache_free(ocfs2_uptodate_cachep, new);
	399
	400	/* If these were used, then ocfs2_expand_cache re-set them to
	401	* NULL for us. */
	402	if (tree[0]) {
	403	for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++)
	404	if (tree[i])
	405	kmem_cache_free(ocfs2_uptodate_cachep,
	406	tree[i]);
	407	}
	408	}
	409
251b6ecc	410	/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
ccd979bd MF	411	* advantage of this by not rechecking for a duplicate insert during
	412	* the slow case. Additionally, if the cache needs to be bumped up to
	413	* a tree, the code will not recheck after acquiring the lock --
	414	* multiple paths cannot be expanding to a tree at the same time.
	415	*
	416	* The slow path takes into account that items can be removed
	417	* (including the whole tree wiped and reset) when this process it out
	418	* allocating memory. In those cases, it reverts back to the fast
	419	* path.
	420	*
	421	* Note that this function may actually fail to insert the block if
	422	* memory cannot be allocated. This is not fatal however (but may
aa958874 MF	423	* result in a performance penalty)
	424	*
	425	* Readahead buffers can be passed in here before the I/O request is
	426	* completed.
	427	*/
ccd979bd MF	428	void ocfs2_set_buffer_uptodate(struct inode *inode,
	429	struct buffer_head *bh)
	430	{
	431	int expand;
	432	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	433	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	434
	435	/* The block may very well exist in our cache already, so avoid
	436	* doing any more work in that case. */
	437	if (ocfs2_buffer_cached(oi, bh))
	438	return;
	439
b0697053 MF	440	mlog(0, "Inode %llu, inserting block %llu\n",
	441	(unsigned long long)oi->ip_blkno,
	442	(unsigned long long)bh->b_blocknr);
ccd979bd MF	443
ccd979bd MF	444	/* No need to recheck under spinlock - insertion is guarded by
251b6ecc	445	* ip_io_mutex */
ccd979bd MF	446	spin_lock(&oi->ip_lock);
	447	if (ocfs2_insert_can_use_array(oi, ci)) {
	448	/* Fast case - it's an array and there's a free
	449	* spot. */
	450	ocfs2_append_cache_array(ci, bh->b_blocknr);
	451	spin_unlock(&oi->ip_lock);
	452	return;
	453	}
	454
	455	expand = 0;
	456	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
	457	/* We need to bump things up to a tree. */
	458	expand = 1;
	459	}
	460	spin_unlock(&oi->ip_lock);
	461
	462	__ocfs2_set_buffer_uptodate(oi, bh->b_blocknr, expand);
	463	}
	464
	465	/* Called against a newly allocated buffer. Most likely nobody should
	466	* be able to read this sort of metadata while it's still being
251b6ecc	467	* allocated, but this is careful to take ip_io_mutex anyway. */
ccd979bd MF	468	void ocfs2_set_new_buffer_uptodate(struct inode *inode,
	469	struct buffer_head *bh)
	470	{
	471	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	472
	473	/* This should definitely not exist in our cache */
	474	BUG_ON(ocfs2_buffer_cached(oi, bh));
	475
	476	set_buffer_uptodate(bh);
	477
251b6ecc	478	mutex_lock(&oi->ip_io_mutex);
ccd979bd	479	ocfs2_set_buffer_uptodate(inode, bh);
251b6ecc	480	mutex_unlock(&oi->ip_io_mutex);
ccd979bd MF	481	}
	482
	483	/* Requires ip_lock. */
	484	static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
	485	int index)
	486	{
	487	sector_t *array = ci->ci_cache.ci_array;
	488	int bytes;
	489
	490	BUG_ON(index < 0 \|\| index >= OCFS2_INODE_MAX_CACHE_ARRAY);
	491	BUG_ON(index >= ci->ci_num_cached);
	492	BUG_ON(!ci->ci_num_cached);
	493
	494	mlog(0, "remove index %d (num_cached = %u\n", index,
	495	ci->ci_num_cached);
	496
	497	ci->ci_num_cached--;
	498
	499	/* don't need to copy if the array is now empty, or if we
	500	* removed at the tail */
	501	if (ci->ci_num_cached && index < ci->ci_num_cached) {
	502	bytes = sizeof(sector_t) * (ci->ci_num_cached - index);
	503	memmove(&array[index], &array[index + 1], bytes);
	504	}
	505	}
	506
	507	/* Requires ip_lock. */
	508	static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
	509	struct ocfs2_meta_cache_item *item)
	510	{
	511	mlog(0, "remove block %llu from tree\n",
	512	(unsigned long long) item->c_block);
	513
	514	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
	515	ci->ci_num_cached--;
	516	}
	517
ac11c827 TM	518	static void ocfs2_remove_block_from_cache(struct inode *inode,
ac11c827 TM	519	sector_t block)
ccd979bd MF	520	{
ccd979bd MF	521	int index;
ccd979bd MF	522	struct ocfs2_meta_cache_item *item = NULL;
	523	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	524	struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
	525
	526	spin_lock(&oi->ip_lock);
b0697053 MF	527	mlog(0, "Inode %llu, remove %llu, items = %u, array = %u\n",
	528	(unsigned long long)oi->ip_blkno,
	529	(unsigned long long) block, ci->ci_num_cached,
ccd979bd MF	530	oi->ip_flags & OCFS2_INODE_CACHE_INLINE);
	531
	532	if (oi->ip_flags & OCFS2_INODE_CACHE_INLINE) {
	533	index = ocfs2_search_cache_array(ci, block);
	534	if (index != -1)
	535	ocfs2_remove_metadata_array(ci, index);
	536	} else {
	537	item = ocfs2_search_cache_tree(ci, block);
	538	if (item)
	539	ocfs2_remove_metadata_tree(ci, item);
	540	}
	541	spin_unlock(&oi->ip_lock);
	542
	543	if (item)
	544	kmem_cache_free(ocfs2_uptodate_cachep, item);
	545	}
	546
ac11c827 TM	547	/*
	548	* Called when we remove a chunk of metadata from an inode. We don't
	549	* bother reverting things to an inlined array in the case of a remove
	550	* which moves us back under the limit.
	551	*/
	552	void ocfs2_remove_from_cache(struct inode *inode,
	553	struct buffer_head *bh)
	554	{
	555	sector_t block = bh->b_blocknr;
	556
	557	ocfs2_remove_block_from_cache(inode, block);
	558	}
	559
	560	/* Called when we remove xattr clusters from an inode. */
	561	void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
	562	sector_t block,
	563	u32 c_len)
	564	{
	565	u64 i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
	566
	567	for (i = 0; i < b_len; i++, block++)
	568	ocfs2_remove_block_from_cache(inode, block);
	569	}
	570
ccd979bd MF	571	int __init init_ocfs2_uptodate_cache(void)
	572	{
	573	ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
	574	sizeof(struct ocfs2_meta_cache_item),
20c2df83	575	0, SLAB_HWCACHE_ALIGN, NULL);
ccd979bd MF	576	if (!ocfs2_uptodate_cachep)
	577	return -ENOMEM;
	578
	579	mlog(0, "%u inlined cache items per inode.\n",
	580	OCFS2_INODE_MAX_CACHE_ARRAY);
	581
	582	return 0;
	583	}
	584
0c6c98fb	585	void exit_ocfs2_uptodate_cache(void)
ccd979bd MF	586	{
	587	if (ocfs2_uptodate_cachep)
	588	kmem_cache_destroy(ocfs2_uptodate_cachep);
	589	}