Btrfs: use a lock to protect incompat/compat flag of the super block

[linux-2.6-block.git] / fs / btrfs / ctree.h
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 547b7b05727f917dfc2bad6516f5c5b30ea12c68..c3b15f8dca009f4956a965ee00a88d3ec95c9165 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
  #include <trace/events/btrfs.h>
  #include <asm/kmap_types.h>
  #include <linux/pagemap.h>
+#include <linux/btrfs.h>
  #include "extent_io.h"
  #include "extent_map.h"
  #include "async-thread.h"
-#include "ioctl.h"
  
  struct btrfs_trans_handle;
  struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
  extern struct kmem_cache *btrfs_free_space_cachep;
  struct btrfs_ordered_sum;
  
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
  
  #define BTRFS_MAX_MIRRORS 3
  
@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
  /* ioprio of readahead is set to idle */
  #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
  
+#define BTRFS_DIRTY_METADATA_THRESH    (32 * 1024 * 1024)
+
  /*
   * The key defines the order in the tree, and so it also defines (optimal)
   * block layout.
@@ -336,7 +338,11 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
  /*
   * File system states
   */
+#define BTRFS_FS_STATE_ERROR           0
+#define BTRFS_FS_STATE_REMOUNTING      1
+#define BTRFS_FS_STATE_TRANS_ABORTED   2
  
+/* Super block flags */
  /* Errors detected */
  #define BTRFS_SUPER_FLAG_ERROR         (1ULL << 2)
  
@@ -502,6 +508,8 @@ struct btrfs_super_block {
  #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA    (1ULL << 5)
  
  #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF   (1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56          (1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
  
  #define BTRFS_FEATURE_COMPAT_SUPP              0ULL
  #define BTRFS_FEATURE_COMPAT_RO_SUPP           0ULL
@@ -511,7 +519,9 @@ struct btrfs_super_block {
          BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |          \
          BTRFS_FEATURE_INCOMPAT_BIG_METADATA |          \
          BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |          \
-        BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+        BTRFS_FEATURE_INCOMPAT_RAID56 |                \
+        BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |         \
+        BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
  
  /*
   * A leaf is full of items. offset and size tell us where to find
@@ -576,7 +586,6 @@ struct btrfs_path {
         unsigned int skip_locking:1;
         unsigned int leave_spinning:1;
         unsigned int search_commit_root:1;
-       unsigned int really_keep_locks:1;
  };
  
  /*
@@ -952,8 +961,20 @@ struct btrfs_dev_replace_item {
  #define BTRFS_BLOCK_GROUP_RAID1                (1ULL << 4)
  #define BTRFS_BLOCK_GROUP_DUP          (1ULL << 5)
  #define BTRFS_BLOCK_GROUP_RAID10       (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5    (1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6    (1 << 8)
  #define BTRFS_BLOCK_GROUP_RESERVED     BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES            5
+
+enum btrfs_raid_types {
+       BTRFS_RAID_RAID10,
+       BTRFS_RAID_RAID1,
+       BTRFS_RAID_DUP,
+       BTRFS_RAID_RAID0,
+       BTRFS_RAID_SINGLE,
+       BTRFS_RAID_RAID5,
+       BTRFS_RAID_RAID6,
+       BTRFS_NR_RAID_TYPES
+};
  
  #define BTRFS_BLOCK_GROUP_TYPE_MASK    (BTRFS_BLOCK_GROUP_DATA |    \
                                          BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +982,8 @@ struct btrfs_dev_replace_item {
  
  #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 |   \
                                          BTRFS_BLOCK_GROUP_RAID1 |   \
+                                        BTRFS_BLOCK_GROUP_RAID5 |   \
+                                        BTRFS_BLOCK_GROUP_RAID6 |   \
                                          BTRFS_BLOCK_GROUP_DUP |     \
                                          BTRFS_BLOCK_GROUP_RAID10)
  /*
@@ -1185,6 +1208,10 @@ struct btrfs_block_group_cache {
         u64 flags;
         u64 sectorsize;
         u64 cache_generation;
+
+       /* for raid56, this is a full stripe, without parity */
+       unsigned long full_stripe_len;
+
         unsigned int ro:1;
         unsigned int dirty:1;
         unsigned int iref:1;
@@ -1225,6 +1252,28 @@ struct seq_list {
         u64 seq;
  };
  
+enum btrfs_orphan_cleanup_state {
+       ORPHAN_CLEANUP_STARTED  = 1,
+       ORPHAN_CLEANUP_DONE     = 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+       struct list_head hash_list;
+       wait_queue_head_t wait;
+       spinlock_t lock;
+};
+
+/* raid56 stripe hash table, together with its stripe cache list */
+struct btrfs_stripe_hash_table {
+       struct list_head stripe_cache;
+       spinlock_t cache_lock;
+       int cache_size;
+       struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
  /* fs_info */
  struct reloc_control;
  struct btrfs_device;
@@ -1250,6 +1299,7 @@ struct btrfs_fs_info {
  
         /* block group cache stuff */
         spinlock_t block_group_cache_lock;
+       u64 first_logical_byte;
         struct rb_root block_group_cache_tree;
  
         /* keep track of unallocated space */
@@ -1288,7 +1338,23 @@ struct btrfs_fs_info {
         u64 last_trans_log_full_commit;
         unsigned long mount_opt;
         unsigned long compress_type:4;
+       /*
+        * It is a suggestive number; the read side is safe even if it gets
+        * a wrong number, because we will write out the data into a regular
+        * extent. The write side (mount/remount) is under ->s_umount lock,
+        * so it is also safe.
+        */
         u64 max_inline;
+       /*
+        * Protected by ->chunk_mutex and sb->s_umount.
+        *
+        * The reason that we use two locks to protect it is that only
+        * remount and mount operations can change it and these two operations
+        * are under sb->s_umount, but the read side (chunk allocation) can not
+        * acquire sb->s_umount or a deadlock would happen. So we use two
+        * locks to protect it. On the write side, we must acquire both locks,
+        * and on the read side, we just need to acquire one of them.
+        */
         u64 alloc_start;
         struct btrfs_transaction *running_transaction;
         wait_queue_head_t transaction_throttle;
@@ -1296,6 +1362,17 @@ struct btrfs_fs_info {
         wait_queue_head_t transaction_blocked_wait;
         wait_queue_head_t async_submit_wait;
  
+       /*
+        * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+        * when they are updated.
+        *
+        * Because we never clear the flags, we needn't use the lock on
+        * the read side.
+        *
+        * We also needn't use the lock when we mount the fs, because
+        * there is no other task which will update the flag.
+        */
+       spinlock_t super_lock;
         struct btrfs_super_block *super_copy;
         struct btrfs_super_block *super_for_commit;
         struct block_device *__bdev;
@@ -1307,6 +1384,13 @@ struct btrfs_fs_info {
         struct mutex cleaner_mutex;
         struct mutex chunk_mutex;
         struct mutex volume_mutex;
+
+       /* this is used during read/modify/write to make sure
+        * no two IOs are trying to modify the same stripe at the same
+        * time
+        */
+       struct btrfs_stripe_hash_table *stripe_hash_table;
+
         /*
          * this protects the ordered operations list only while we are
          * processing all of the entries on it.  This way we make
@@ -1365,6 +1449,7 @@ struct btrfs_fs_info {
          */
         struct list_head ordered_extents;
  
+       spinlock_t delalloc_lock;
         /*
          * all of the inodes that have delalloc bytes.  It is possible for
          * this list to be empty even when there is still dirty data=ordered
@@ -1372,13 +1457,6 @@ struct btrfs_fs_info {
          */
         struct list_head delalloc_inodes;
  
-       /*
-        * special rename and truncate targets that must be on disk before
-        * we're allowed to commit.  This is basically the ext3 style
-        * data=ordered list.
-        */
-       struct list_head ordered_operations;
-
         /*
          * there is a pool of worker threads for checksumming during writes
          * and a pool for checksumming after reads.  This is because readers
@@ -1395,6 +1473,8 @@ struct btrfs_fs_info {
         struct btrfs_workers flush_workers;
         struct btrfs_workers endio_workers;
         struct btrfs_workers endio_meta_workers;
+       struct btrfs_workers endio_raid56_workers;
+       struct btrfs_workers rmw_workers;
         struct btrfs_workers endio_meta_write_workers;
         struct btrfs_workers endio_write_workers;
         struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1503,12 @@ struct btrfs_fs_info {
  
         u64 total_pinned;
  
-       /* protected by the delalloc lock, used to keep from writing
-        * metadata until there is a nice batch
-        */
-       u64 dirty_metadata_bytes;
+       /* used to keep from writing metadata until there is a nice batch */
+       struct percpu_counter dirty_metadata_bytes;
+       struct percpu_counter delalloc_bytes;
+       s32 dirty_metadata_batch;
+       s32 delalloc_batch;
+
         struct list_head dirty_cowonly_roots;
  
         struct btrfs_fs_devices *fs_devices;
@@ -1442,9 +1524,6 @@ struct btrfs_fs_info {
  
         struct reloc_control *reloc_ctl;
  
-       spinlock_t delalloc_lock;
-       u64 delalloc_bytes;
-
         /* data_alloc_cluster is only used in ssd mode */
         struct btrfs_free_cluster data_alloc_cluster;
  
@@ -1456,6 +1535,8 @@ struct btrfs_fs_info {
         struct rb_root defrag_inodes;
         atomic_t defrag_running;
  
+       /* Used to protect avail_{data, metadata, system}_alloc_bits */
+       seqlock_t profiles_lock;
         /*
          * these three are in extended format (availability of single
          * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
@@ -1513,6 +1594,9 @@ struct btrfs_fs_info {
         struct rb_root qgroup_tree;
         spinlock_t qgroup_lock;
  
+       /* protect user change for quota operations */
+       struct mutex qgroup_ioctl_lock;
+
         /* list of dirty qgroups to be written at next commit */
         struct list_head dirty_qgroups;
  
@@ -1520,7 +1604,7 @@ struct btrfs_fs_info {
         u64 qgroup_seq;
  
         /* filesystem state */
-       u64 fs_state;
+       unsigned long fs_state;
  
         struct btrfs_delayed_root *delayed_root;
  
@@ -1623,6 +1707,9 @@ struct btrfs_root {
  
         struct list_head root_list;
  
+       spinlock_t log_extents_lock[2];
+       struct list_head logged_list[2];
+
         spinlock_t orphan_lock;
         atomic_t orphan_inodes;
         struct btrfs_block_rsv *orphan_block_rsv;
@@ -1737,6 +1824,12 @@ struct btrfs_ioctl_defrag_range_args {
   */
  #define BTRFS_EXTENT_ITEM_KEY  168
  
+/*
+ * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata whose length
+ * we already know, so we save the level in key->offset instead of the length.
+ */
+#define BTRFS_METADATA_ITEM_KEY        169
+
  #define BTRFS_TREE_BLOCK_REF_KEY       176
  
  #define BTRFS_EXTENT_DATA_REF_KEY      178
@@ -1832,6 +1925,7 @@ struct btrfs_ioctl_defrag_range_args {
  
  #define btrfs_clear_opt(o, opt)                ((o) &= ~BTRFS_MOUNT_##opt)
  #define btrfs_set_opt(o, opt)          ((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)     ((o) & BTRFS_MOUNT_##opt)
  #define btrfs_test_opt(root, opt)      ((root)->fs_info->mount_opt & \
                                          BTRFS_MOUNT_##opt)
  /*
@@ -2933,11 +3027,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
  int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
  int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root, u64 bytenr,
-                            u64 num_bytes, u64 *refs, u64 *flags);
+                            u64 offset, int metadata, u64 *refs, u64 *flags);
  int btrfs_pin_extent(struct btrfs_root *root,
                      u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-                                   struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
                                     u64 bytenr, u64 num_bytes);
  int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
@@ -3035,8 +3128,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
  int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
                                   struct inode *inode);
  void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-                               struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+                                    struct btrfs_block_rsv *rsv,
+                                    int nitems,
+                                    u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+                                     struct btrfs_block_rsv *rsv,
+                                     u64 qgroup_reserved);
  int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
  void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
  int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3190,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
  struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
  int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
                         struct btrfs_key *key, int lowest_level,
-                       int cache_only, u64 min_trans);
+                       u64 min_trans);
  int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
                          struct btrfs_key *max_key,
-                        struct btrfs_path *path, int cache_only,
+                        struct btrfs_path *path,
                          u64 min_trans);
  enum btrfs_compare_tree_result {
         BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3246,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
                                int find_higher, int return_any);
  int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *parent,
-                      int start_slot, int cache_only, u64 *last_ret,
+                      int start_slot, u64 *last_ret,
                        struct btrfs_key *progress);
  void btrfs_release_path(struct btrfs_path *p);
  struct btrfs_path *btrfs_alloc_path(void);
@@ -3188,9 +3286,6 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
  }
  
  int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-int btrfs_next_leaf_write(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root, struct btrfs_path *path,
-                         int del);
  int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
                         u64 time_seq);
  static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3459,9 +3554,9 @@ int btrfs_writepages(struct address_space *mapping,
                      struct writeback_control *wbc);
  int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
                              struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-                        size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+                        size_t size, struct bio *bio,
+                        unsigned long bio_flags);
  int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
  int btrfs_readpage(struct file *file, struct page *page);
  void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3638,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
  
  /* tree-defrag.c */
  int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root, int cache_only);
+                       struct btrfs_root *root);
  
  /* sysfs.c */
  int btrfs_init_sysfs(void);
@@ -3558,14 +3653,31 @@ int btrfs_sync_fs(struct super_block *sb, int wait);
  
  #ifdef CONFIG_PRINTK
  __printf(2, 3)
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...);
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
  #else
  static inline __printf(2, 3)
-void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
  {
  }
  #endif
  
+#define btrfs_emerg(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+#define btrfs_debug(fs_info, fmt, args...) \
+       btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+
  __printf(5, 6)
  void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
                      unsigned int line, int errno, const char *fmt, ...);
@@ -3587,11 +3699,28 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
         disk_super = fs_info->super_copy;
         features = btrfs_super_incompat_flags(disk_super);
         if (!(features & flag)) {
-               features |= flag;
-               btrfs_set_super_incompat_flags(disk_super, features);
+               spin_lock(&fs_info->super_lock);
+               features = btrfs_super_incompat_flags(disk_super);
+               if (!(features & flag)) {
+                       features |= flag;
+                       btrfs_set_super_incompat_flags(disk_super, features);
+                       printk(KERN_INFO "btrfs: setting %llu feature flag\n",
+                                        flag);
+               }
+               spin_unlock(&fs_info->super_lock);
         }
  }
  
+#define btrfs_fs_incompat(fs_info, opt) \
+       __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+       struct btrfs_super_block *disk_super;
+       disk_super = fs_info->super_copy;
+       return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
+
  /*
   * Call btrfs_abort_transaction as early as possible when an error condition is
   * detected, that way the exact line number is reported.
@@ -3620,11 +3749,14 @@ __printf(5, 6)
  void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
                    unsigned int line, int errno, const char *fmt, ...);
  
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
  #define btrfs_panic(fs_info, errno, fmt, args...)                      \
  do {                                                                   \
-       struct btrfs_fs_info *_i = (fs_info);                           \
-       __btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);      \
-       BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));    \
+       __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
+       BUG();                                                          \
  } while (0)
  
  /* acl.c */
@@ -3745,4 +3877,11 @@ static inline int is_fstree(u64 rootid)
                 return 1;
         return 0;
  }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+       return signal_pending(current);
+}
+
+
  #endif