fs/ext4/super.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  linux/fs/ext4/super.c
   4  *
   5  * Copyright (C) 1992, 1993, 1994, 1995
   6  * Remy Card (card@masi.ibp.fr)
   7  * Laboratoire MASI - Institut Blaise Pascal
   8  * Universite Pierre et Marie Curie (Paris VI)
   9  *
  10  *  from
  11  *
  12  *  linux/fs/minix/inode.c
  13  *
  14  *  Copyright (C) 1991, 1992  Linus Torvalds
  15  *
  16  *  Big-endian to little-endian byte-swapping/bitmaps by
  17  *        David S. Miller (davem@caip.rutgers.edu), 1995
  18  */
  19
  20 #include <linux/module.h>
  21 #include <linux/string.h>
  22 #include <linux/fs.h>
  23 #include <linux/time.h>
  24 #include <linux/vmalloc.h>
  25 #include <linux/slab.h>
  26 #include <linux/init.h>
  27 #include <linux/blkdev.h>
  28 #include <linux/backing-dev.h>
  29 #include <linux/parser.h>
  30 #include <linux/buffer_head.h>
  31 #include <linux/exportfs.h>
  32 #include <linux/vfs.h>
  33 #include <linux/random.h>
  34 #include <linux/mount.h>
  35 #include <linux/namei.h>
  36 #include <linux/quotaops.h>
  37 #include <linux/seq_file.h>
  38 #include <linux/ctype.h>
  39 #include <linux/log2.h>
  40 #include <linux/crc16.h>
  41 #include <linux/dax.h>
  42 #include <linux/cleancache.h>
  43 #include <linux/uaccess.h>
  44 #include <linux/iversion.h>
  45 #include <linux/unicode.h>
  46 #include <linux/part_stat.h>
  47 #include <linux/kthread.h>
  48 #include <linux/freezer.h>
  49
  50 #include "ext4.h"
  51 #include "ext4_extents.h"       /* Needed for trace points definition */
  52 #include "ext4_jbd2.h"
  53 #include "xattr.h"
  54 #include "acl.h"
  55 #include "mballoc.h"
  56 #include "fsmap.h"
  57
  58 #define CREATE_TRACE_POINTS
  59 #include <trace/events/ext4.h>
  60
/*
 * File-scope lazy-init state and a mutex declared alongside it
 * (presumably the mutex guards ext4_li_info -- confirm at the use sites).
 */
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
/* Rate-limit state; going by its name it throttles mount-time messages. */
static struct ratelimit_state ext4_mount_msg_ratelimit;

/*
 * Forward declarations.  All of these are static, so their definitions
 * have to live in this translation unit.
 */
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
static int ext4_show_options(struct seq_file *seq, struct dentry *root);
static int ext4_commit_super(struct super_block *sb, int sync);
static void ext4_mark_recovery_complete(struct super_block *sb,
                                        struct ext4_super_block *es);
static void ext4_clear_journal_err(struct super_block *sb,
                                   struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
static int ext4_remount(struct super_block *sb, int *flags, char *data);
static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
                                            unsigned int journal_inum);
  88
  89 /*
  90  * Lock ordering
  91  *
  92  * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
  93  * i_mmap_rwsem (inode->i_mmap_rwsem)!
  94  *
  95  * page fault path:
  96  * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
  97  *   page lock -> i_data_sem (rw)
  98  *
  99  * buffered write path:
 100  * sb_start_write -> i_mutex -> mmap_sem
 101  * sb_start_write -> i_mutex -> transaction start -> page lock ->
 102  *   i_data_sem (rw)
 103  *
 104  * truncate:
 105  * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
 106  * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
 107  *   i_data_sem (rw)
 108  *
 109  * direct IO:
 110  * sb_start_write -> i_mutex -> mmap_sem
 111  * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
 112  *
 113  * writepages:
 114  * transaction start -> page lock(s) -> i_data_sem (rw)
 115  */
 116
/*
 * When no ext2 driver is configured (neither built in nor as a module) and
 * CONFIG_EXT4_USE_FOR_EXT2 is set, describe an "ext2" file system type
 * whose mount entry point is ext4_mount().  IS_EXT2_SB() then tests
 * whether the holder of the superblock's block device is that type;
 * in every other configuration it is the constant 0.
 */
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext2",
        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext2");
MODULE_ALIAS("ext2");
#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type)
#else
#define IS_EXT2_SB(sb) (0)
#endif
 131
 132
/*
 * "ext3" file system type; mounts go through ext4_mount().  Unlike the
 * ext2 variant this definition is unconditional.
 */
static struct file_system_type ext3_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext3",
        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        .fs_flags       = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ext3");
MODULE_ALIAS("ext3");
/* True when the holder of the superblock's block device is ext3_fs_type. */
#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type)
 143
 144 /*
 145  * This works like sb_bread() except it uses ERR_PTR for error
 146  * returns.  Currently with sb_bread it's impossible to distinguish
 147  * between ENOMEM and EIO situations (since both result in a NULL
 148  * return.
 149  */
 150 struct buffer_head *
 151 ext4_sb_bread(struct super_block *sb, sector_t block, int op_flags)
 152 {
 153         struct buffer_head *bh = sb_getblk(sb, block);
 154
 155         if (bh == NULL)
 156                 return ERR_PTR(-ENOMEM);
 157         if (ext4_buffer_uptodate(bh))
 158                 return bh;
 159         ll_rw_block(REQ_OP_READ, REQ_META | op_flags, 1, &bh);
 160         wait_on_buffer(bh);
 161         if (buffer_uptodate(bh))
 162                 return bh;
 163         put_bh(bh);
 164         return ERR_PTR(-EIO);
 165 }
 166
 167 static int ext4_verify_csum_type(struct super_block *sb,
 168                                  struct ext4_super_block *es)
 169 {
 170         if (!ext4_has_feature_metadata_csum(sb))
 171                 return 1;
 172
 173         return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
 174 }
 175
 176 static __le32 ext4_superblock_csum(struct super_block *sb,
 177                                    struct ext4_super_block *es)
 178 {
 179         struct ext4_sb_info *sbi = EXT4_SB(sb);
 180         int offset = offsetof(struct ext4_super_block, s_checksum);
 181         __u32 csum;
 182
 183         csum = ext4_chksum(sbi, ~0, (char *)es, offset);
 184
 185         return cpu_to_le32(csum);
 186 }
 187
 188 static int ext4_superblock_csum_verify(struct super_block *sb,
 189                                        struct ext4_super_block *es)
 190 {
 191         if (!ext4_has_metadata_csum(sb))
 192                 return 1;
 193
 194         return es->s_checksum == ext4_superblock_csum(sb, es);
 195 }
 196
 197 void ext4_superblock_csum_set(struct super_block *sb)
 198 {
 199         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 200
 201         if (!ext4_has_metadata_csum(sb))
 202                 return;
 203
 204         es->s_checksum = ext4_superblock_csum(sb, es);
 205 }
 206
 207 ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 208                                struct ext4_group_desc *bg)
 209 {
 210         return le32_to_cpu(bg->bg_block_bitmap_lo) |
 211                 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 212                  (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 213 }
 214
 215 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 216                                struct ext4_group_desc *bg)
 217 {
 218         return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 219                 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 220                  (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 221 }
 222
 223 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 224                               struct ext4_group_desc *bg)
 225 {
 226         return le32_to_cpu(bg->bg_inode_table_lo) |
 227                 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 228                  (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 229 }
 230
 231 __u32 ext4_free_group_clusters(struct super_block *sb,
 232                                struct ext4_group_desc *bg)
 233 {
 234         return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 235                 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 236                  (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 237 }
 238
 239 __u32 ext4_free_inodes_count(struct super_block *sb,
 240                               struct ext4_group_desc *bg)
 241 {
 242         return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 243                 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 244                  (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 245 }
 246
 247 __u32 ext4_used_dirs_count(struct super_block *sb,
 248                               struct ext4_group_desc *bg)
 249 {
 250         return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 251                 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 252                  (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 253 }
 254
 255 __u32 ext4_itable_unused_count(struct super_block *sb,
 256                               struct ext4_group_desc *bg)
 257 {
 258         return le16_to_cpu(bg->bg_itable_unused_lo) |
 259                 (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 260                  (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 261 }
 262
 263 void ext4_block_bitmap_set(struct super_block *sb,
 264                            struct ext4_group_desc *bg, ext4_fsblk_t blk)
 265 {
 266         bg->bg_block_bitmap_lo = cpu_to_le32((u32)blk);
 267         if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 268                 bg->bg_block_bitmap_hi = cpu_to_le32(blk >> 32);
 269 }
 270
 271 void ext4_inode_bitmap_set(struct super_block *sb,
 272                            struct ext4_group_desc *bg, ext4_fsblk_t blk)
 273 {
 274         bg->bg_inode_bitmap_lo  = cpu_to_le32((u32)blk);
 275         if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 276                 bg->bg_inode_bitmap_hi = cpu_to_le32(blk >> 32);
 277 }
 278
 279 void ext4_inode_table_set(struct super_block *sb,
 280                           struct ext4_group_desc *bg, ext4_fsblk_t blk)
 281 {
 282         bg->bg_inode_table_lo = cpu_to_le32((u32)blk);
 283         if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 284                 bg->bg_inode_table_hi = cpu_to_le32(blk >> 32);
 285 }
 286
 287 void ext4_free_group_clusters_set(struct super_block *sb,
 288                                   struct ext4_group_desc *bg, __u32 count)
 289 {
 290         bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count);
 291         if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 292                 bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16);
 293 }
 294
 295 void ext4_free_inodes_set(struct super_block *sb,
 296                           struct ext4_group_desc *bg, __u32 count)
 297 {
 298         bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
 299         if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 300                 bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
 301 }
 302
 303 void ext4_used_dirs_set(struct super_block *sb,
 304                           struct ext4_group_desc *bg, __u32 count)
 305 {
 306         bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count);
 307         if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 308                 bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16);
 309 }
 310
 311 void ext4_itable_unused_set(struct super_block *sb,
 312                           struct ext4_group_desc *bg, __u32 count)
 313 {
 314         bg->bg_itable_unused_lo = cpu_to_le16((__u16)count);
 315         if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
 316                 bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 317 }
 318
 319 static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
 320 {
 321         time64_t now = ktime_get_real_seconds();
 322
 323         now = clamp_val(now, 0, (1ull << 40) - 1);
 324
 325         *lo = cpu_to_le32(lower_32_bits(now));
 326         *hi = upper_32_bits(now);
 327 }
 328
 329 static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
 330 {
 331         return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
 332 }
/*
 * Field-name based wrappers around __ext4_update_tstamp() and
 * __ext4_get_tstamp(): @tstamp names a member of @es, and the matching
 * high byte is the member called <tstamp>_hi (built by token pasting).
 */
#define ext4_update_tstamp(es, tstamp) \
        __ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
#define ext4_get_tstamp(es, tstamp) \
        __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 337
 338 static void __save_error_info(struct super_block *sb, int error,
 339                               __u32 ino, __u64 block,
 340                               const char *func, unsigned int line)
 341 {
 342         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 343         int err;
 344
 345         EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 346         if (bdev_read_only(sb->s_bdev))
 347                 return;
 348         es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
 349         ext4_update_tstamp(es, s_last_error_time);
 350         strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
 351         es->s_last_error_line = cpu_to_le32(line);
 352         es->s_last_error_ino = cpu_to_le32(ino);
 353         es->s_last_error_block = cpu_to_le64(block);
 354         switch (error) {
 355         case EIO:
 356                 err = EXT4_ERR_EIO;
 357                 break;
 358         case ENOMEM:
 359                 err = EXT4_ERR_ENOMEM;
 360                 break;
 361         case EFSBADCRC:
 362                 err = EXT4_ERR_EFSBADCRC;
 363                 break;
 364         case 0:
 365         case EFSCORRUPTED:
 366                 err = EXT4_ERR_EFSCORRUPTED;
 367                 break;
 368         case ENOSPC:
 369                 err = EXT4_ERR_ENOSPC;
 370                 break;
 371         case ENOKEY:
 372                 err = EXT4_ERR_ENOKEY;
 373                 break;
 374         case EROFS:
 375                 err = EXT4_ERR_EROFS;
 376                 break;
 377         case EFBIG:
 378                 err = EXT4_ERR_EFBIG;
 379                 break;
 380         case EEXIST:
 381                 err = EXT4_ERR_EEXIST;
 382                 break;
 383         case ERANGE:
 384                 err = EXT4_ERR_ERANGE;
 385                 break;
 386         case EOVERFLOW:
 387                 err = EXT4_ERR_EOVERFLOW;
 388                 break;
 389         case EBUSY:
 390                 err = EXT4_ERR_EBUSY;
 391                 break;
 392         case ENOTDIR:
 393                 err = EXT4_ERR_ENOTDIR;
 394                 break;
 395         case ENOTEMPTY:
 396                 err = EXT4_ERR_ENOTEMPTY;
 397                 break;
 398         case ESHUTDOWN:
 399                 err = EXT4_ERR_ESHUTDOWN;
 400                 break;
 401         case EFAULT:
 402                 err = EXT4_ERR_EFAULT;
 403                 break;
 404         default:
 405                 err = EXT4_ERR_UNKNOWN;
 406         }
 407         es->s_last_error_errcode = err;
 408         if (!es->s_first_error_time) {
 409                 es->s_first_error_time = es->s_last_error_time;
 410                 es->s_first_error_time_hi = es->s_last_error_time_hi;
 411                 strncpy(es->s_first_error_func, func,
 412                         sizeof(es->s_first_error_func));
 413                 es->s_first_error_line = cpu_to_le32(line);
 414                 es->s_first_error_ino = es->s_last_error_ino;
 415                 es->s_first_error_block = es->s_last_error_block;
 416                 es->s_first_error_errcode = es->s_last_error_errcode;
 417         }
 418         /*
 419          * Start the daily error reporting function if it hasn't been
 420          * started already
 421          */
 422         if (!es->s_error_count)
 423                 mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ);
 424         le32_add_cpu(&es->s_error_count, 1);
 425 }
 426
 427 static void save_error_info(struct super_block *sb, int error,
 428                             __u32 ino, __u64 block,
 429                             const char *func, unsigned int line)
 430 {
 431         __save_error_info(sb, error, ino, block, func, line);
 432         if (!bdev_read_only(sb->s_bdev))
 433                 ext4_commit_super(sb, 1);
 434 }
 435
 436 /*
 437  * The del_gendisk() function uninitializes the disk-specific data
 438  * structures, including the bdi structure, without telling anyone
 439  * else.  Once this happens, any attempt to call mark_buffer_dirty()
 440  * (for example, by ext4_commit_super), will cause a kernel OOPS.
 441  * This is a kludge to prevent these oops until we can put in a proper
 442  * hook in del_gendisk() to inform the VFS and file system layers.
 443  */
 444 static int block_device_ejected(struct super_block *sb)
 445 {
 446         struct inode *bd_inode = sb->s_bdev->bd_inode;
 447         struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
 448
 449         return bdi->dev == NULL;
 450 }
 451
 452 static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
 453 {
 454         struct super_block              *sb = journal->j_private;
 455         struct ext4_sb_info             *sbi = EXT4_SB(sb);
 456         int                             error = is_journal_aborted(journal);
 457         struct ext4_journal_cb_entry    *jce;
 458
 459         BUG_ON(txn->t_state == T_FINISHED);
 460
 461         ext4_process_freed_data(sb, txn->t_tid);
 462
 463         spin_lock(&sbi->s_md_lock);
 464         while (!list_empty(&txn->t_private_list)) {
 465                 jce = list_entry(txn->t_private_list.next,
 466                                  struct ext4_journal_cb_entry, jce_list);
 467                 list_del_init(&jce->jce_list);
 468                 spin_unlock(&sbi->s_md_lock);
 469                 jce->jce_func(sb, jce, error);
 470                 spin_lock(&sbi->s_md_lock);
 471         }
 472         spin_unlock(&sbi->s_md_lock);
 473 }
 474
 475 static bool system_going_down(void)
 476 {
 477         return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF
 478                 || system_state == SYSTEM_RESTART;
 479 }
 480
 481 /* Deal with the reporting of failure conditions on a filesystem such as
 482  * inconsistencies detected or read IO failures.
 483  *
 484  * On ext2, we can store the error state of the filesystem in the
 485  * superblock.  That is not possible on ext4, because we may have other
 486  * write ordering constraints on the superblock which prevent us from
 487  * writing it out straight away; and given that the journal is about to
 488  * be aborted, we can't rely on the current, or future, transactions to
 489  * write out the superblock safely.
 490  *
 491  * We'll just use the jbd2_journal_abort() error code to record an error in
 492  * the journal instead.  On recovery, the journal will complain about
 493  * that error until we've noted it down and cleared it.
 494  */
 495
 496 static void ext4_handle_error(struct super_block *sb)
 497 {
 498         if (test_opt(sb, WARN_ON_ERROR))
 499                 WARN_ON_ONCE(1);
 500
 501         if (sb_rdonly(sb))
 502                 return;
 503
 504         if (!test_opt(sb, ERRORS_CONT)) {
 505                 journal_t *journal = EXT4_SB(sb)->s_journal;
 506
 507                 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 508                 if (journal)
 509                         jbd2_journal_abort(journal, -EIO);
 510         }
 511         /*
 512          * We force ERRORS_RO behavior when system is rebooting. Otherwise we
 513          * could panic during 'reboot -f' as the underlying device got already
 514          * disabled.
 515          */
 516         if (test_opt(sb, ERRORS_RO) || system_going_down()) {
 517                 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 518                 /*
 519                  * Make sure updated value of ->s_mount_flags will be visible
 520                  * before ->s_flags update
 521                  */
 522                 smp_wmb();
 523                 sb->s_flags |= SB_RDONLY;
 524         } else if (test_opt(sb, ERRORS_PANIC)) {
 525                 if (EXT4_SB(sb)->s_journal &&
 526                   !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 527                         return;
 528                 panic("EXT4-fs (device %s): panic forced after error\n",
 529                         sb->s_id);
 530         }
 531 }
 532
/*
 * Non-zero when an error message for @sb may be printed now, according
 * to the per-superblock s_err_ratelimit_state.
 */
#define ext4_error_ratelimit(sb)                                        \
                ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state),     \
                             "EXT4-fs error")
 536
 537 void __ext4_error(struct super_block *sb, const char *function,
 538                   unsigned int line, int error, __u64 block,
 539                   const char *fmt, ...)
 540 {
 541         struct va_format vaf;
 542         va_list args;
 543
 544         if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 545                 return;
 546
 547         trace_ext4_error(sb, function, line);
 548         if (ext4_error_ratelimit(sb)) {
 549                 va_start(args, fmt);
 550                 vaf.fmt = fmt;
 551                 vaf.va = &args;
 552                 printk(KERN_CRIT
 553                        "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
 554                        sb->s_id, function, line, current->comm, &vaf);
 555                 va_end(args);
 556         }
 557         save_error_info(sb, error, 0, block, function, line);
 558         ext4_handle_error(sb);
 559 }
 560
 561 void __ext4_error_inode(struct inode *inode, const char *function,
 562                         unsigned int line, ext4_fsblk_t block, int error,
 563                         const char *fmt, ...)
 564 {
 565         va_list args;
 566         struct va_format vaf;
 567
 568         if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 569                 return;
 570
 571         trace_ext4_error(inode->i_sb, function, line);
 572         if (ext4_error_ratelimit(inode->i_sb)) {
 573                 va_start(args, fmt);
 574                 vaf.fmt = fmt;
 575                 vaf.va = &args;
 576                 if (block)
 577                         printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 578                                "inode #%lu: block %llu: comm %s: %pV\n",
 579                                inode->i_sb->s_id, function, line, inode->i_ino,
 580                                block, current->comm, &vaf);
 581                 else
 582                         printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
 583                                "inode #%lu: comm %s: %pV\n",
 584                                inode->i_sb->s_id, function, line, inode->i_ino,
 585                                current->comm, &vaf);
 586                 va_end(args);
 587         }
 588         save_error_info(inode->i_sb, error, inode->i_ino, block,
 589                         function, line);
 590         ext4_handle_error(inode->i_sb);
 591 }
 592
 593 void __ext4_error_file(struct file *file, const char *function,
 594                        unsigned int line, ext4_fsblk_t block,
 595                        const char *fmt, ...)
 596 {
 597         va_list args;
 598         struct va_format vaf;
 599         struct inode *inode = file_inode(file);
 600         char pathname[80], *path;
 601
 602         if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 603                 return;
 604
 605         trace_ext4_error(inode->i_sb, function, line);
 606         if (ext4_error_ratelimit(inode->i_sb)) {
 607                 path = file_path(file, pathname, sizeof(pathname));
 608                 if (IS_ERR(path))
 609                         path = "(unknown)";
 610                 va_start(args, fmt);
 611                 vaf.fmt = fmt;
 612                 vaf.va = &args;
 613                 if (block)
 614                         printk(KERN_CRIT
 615                                "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 616                                "block %llu: comm %s: path %s: %pV\n",
 617                                inode->i_sb->s_id, function, line, inode->i_ino,
 618                                block, current->comm, path, &vaf);
 619                 else
 620                         printk(KERN_CRIT
 621                                "EXT4-fs error (device %s): %s:%d: inode #%lu: "
 622                                "comm %s: path %s: %pV\n",
 623                                inode->i_sb->s_id, function, line, inode->i_ino,
 624                                current->comm, path, &vaf);
 625                 va_end(args);
 626         }
 627         save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block,
 628                         function, line);
 629         ext4_handle_error(inode->i_sb);
 630 }
 631
 632 const char *ext4_decode_error(struct super_block *sb, int errno,
 633                               char nbuf[16])
 634 {
 635         char *errstr = NULL;
 636
 637         switch (errno) {
 638         case -EFSCORRUPTED:
 639                 errstr = "Corrupt filesystem";
 640                 break;
 641         case -EFSBADCRC:
 642                 errstr = "Filesystem failed CRC";
 643                 break;
 644         case -EIO:
 645                 errstr = "IO failure";
 646                 break;
 647         case -ENOMEM:
 648                 errstr = "Out of memory";
 649                 break;
 650         case -EROFS:
 651                 if (!sb || (EXT4_SB(sb)->s_journal &&
 652                             EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
 653                         errstr = "Journal has aborted";
 654                 else
 655                         errstr = "Readonly filesystem";
 656                 break;
 657         default:
 658                 /* If the caller passed in an extra buffer for unknown
 659                  * errors, textualise them now.  Else we just return
 660                  * NULL. */
 661                 if (nbuf) {
 662                         /* Check for truncated error codes... */
 663                         if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
 664                                 errstr = nbuf;
 665                 }
 666                 break;
 667         }
 668
 669         return errstr;
 670 }
 671
 672 /* __ext4_std_error decodes expected errors from journaling functions
 673  * automatically and invokes the appropriate error response.  */
 674
 675 void __ext4_std_error(struct super_block *sb, const char *function,
 676                       unsigned int line, int errno)
 677 {
 678         char nbuf[16];
 679         const char *errstr;
 680
 681         if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 682                 return;
 683
 684         /* Special case: if the error is EROFS, and we're not already
 685          * inside a transaction, then there's really no point in logging
 686          * an error. */
 687         if (errno == -EROFS && journal_current_handle() == NULL && sb_rdonly(sb))
 688                 return;
 689
 690         if (ext4_error_ratelimit(sb)) {
 691                 errstr = ext4_decode_error(sb, errno, nbuf);
 692                 printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
 693                        sb->s_id, function, line, errstr);
 694         }
 695
 696         save_error_info(sb, -errno, 0, 0, function, line);
 697         ext4_handle_error(sb);
 698 }
 699
 700 /*
 701  * ext4_abort is a much stronger failure handler than ext4_error.  The
 702  * abort function may be used to deal with unrecoverable failures such
 703  * as journal IO errors or ENOMEM at a critical moment in log management.
 704  *
 705  * We unconditionally force the filesystem into an ABORT|READONLY state,
 706  * unless the error response on the fs has been set to panic in which
 707  * case we take the easy way out and panic immediately.
 708  */
 709
 710 void __ext4_abort(struct super_block *sb, const char *function,
 711                   unsigned int line, int error, const char *fmt, ...)
 712 {
 713         struct va_format vaf;
 714         va_list args;
 715
 716         if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 717                 return;
 718
 719         save_error_info(sb, error, 0, 0, function, line);
 720         va_start(args, fmt);
 721         vaf.fmt = fmt;
 722         vaf.va = &args;
 723         printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: %pV\n",
 724                sb->s_id, function, line, &vaf);
 725         va_end(args);
 726
 727         if (sb_rdonly(sb) == 0) {
 728                 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 729                 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
 730                 /*
 731                  * Make sure updated value of ->s_mount_flags will be visible
 732                  * before ->s_flags update
 733                  */
 734                 smp_wmb();
 735                 sb->s_flags |= SB_RDONLY;
 736                 if (EXT4_SB(sb)->s_journal)
 737                         jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 738         }
 739         if (test_opt(sb, ERRORS_PANIC) && !system_going_down()) {
 740                 if (EXT4_SB(sb)->s_journal &&
 741                   !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
 742                         return;
 743                 panic("EXT4-fs panic from previous error\n");
 744         }
 745 }
 746
 747 void __ext4_msg(struct super_block *sb,
 748                 const char *prefix, const char *fmt, ...)
 749 {
 750         struct va_format vaf;
 751         va_list args;
 752
 753         if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
 754                 return;
 755
 756         va_start(args, fmt);
 757         vaf.fmt = fmt;
 758         vaf.va = &args;
 759         printk("%sEXT4-fs (%s): %pV\n", prefix, sb->s_id, &vaf);
 760         va_end(args);
 761 }
 762
/*
 * Non-zero when a warning for @sb may be printed now, according to the
 * per-superblock s_warning_ratelimit_state.
 */
#define ext4_warning_ratelimit(sb)                                      \
                ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
                             "EXT4-fs warning")
 766
 767 void __ext4_warning(struct super_block *sb, const char *function,
 768                     unsigned int line, const char *fmt, ...)
 769 {
 770         struct va_format vaf;
 771         va_list args;
 772
 773         if (!ext4_warning_ratelimit(sb))
 774                 return;
 775
 776         va_start(args, fmt);
 777         vaf.fmt = fmt;
 778         vaf.va = &args;
 779         printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: %pV\n",
 780                sb->s_id, function, line, &vaf);
 781         va_end(args);
 782 }
 783
 784 void __ext4_warning_inode(const struct inode *inode, const char *function,
 785                           unsigned int line, const char *fmt, ...)
 786 {
 787         struct va_format vaf;
 788         va_list args;
 789
 790         if (!ext4_warning_ratelimit(inode->i_sb))
 791                 return;
 792
 793         va_start(args, fmt);
 794         vaf.fmt = fmt;
 795         vaf.va = &args;
 796         printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 797                "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 798                function, line, inode->i_ino, current->comm, &vaf);
 799         va_end(args);
 800 }
 801
 802 void __ext4_grp_locked_error(const char *function, unsigned int line,
 803                              struct super_block *sb, ext4_group_t grp,
 804                              unsigned long ino, ext4_fsblk_t block,
 805                              const char *fmt, ...)
 806 __releases(bitlock)
 807 __acquires(bitlock)
 808 {
 809         struct va_format vaf;
 810         va_list args;
 811
 812         if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
 813                 return;
 814
 815         trace_ext4_error(sb, function, line);
 816         __save_error_info(sb, EFSCORRUPTED, ino, block, function, line);
 817
 818         if (ext4_error_ratelimit(sb)) {
 819                 va_start(args, fmt);
 820                 vaf.fmt = fmt;
 821                 vaf.va = &args;
 822                 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
 823                        sb->s_id, function, line, grp);
 824                 if (ino)
 825                         printk(KERN_CONT "inode %lu: ", ino);
 826                 if (block)
 827                         printk(KERN_CONT "block %llu:",
 828                                (unsigned long long) block);
 829                 printk(KERN_CONT "%pV\n", &vaf);
 830                 va_end(args);
 831         }
 832
 833         if (test_opt(sb, WARN_ON_ERROR))
 834                 WARN_ON_ONCE(1);
 835
 836         if (test_opt(sb, ERRORS_CONT)) {
 837                 ext4_commit_super(sb, 0);
 838                 return;
 839         }
 840
 841         ext4_unlock_group(sb, grp);
 842         ext4_commit_super(sb, 1);
 843         ext4_handle_error(sb);
 844         /*
 845          * We only get here in the ERRORS_RO case; relocking the group
 846          * may be dangerous, but nothing bad will happen since the
 847          * filesystem will have already been marked read/only and the
 848          * journal has been aborted.  We return 1 as a hint to callers
 849          * who might what to use the return value from
 850          * ext4_grp_locked_error() to distinguish between the
 851          * ERRORS_CONT and ERRORS_RO case, and perhaps return more
 852          * aggressively from the ext4 function in question, with a
 853          * more appropriate error code.
 854          */
 855         ext4_lock_group(sb, grp);
 856         return;
 857 }
 858
 859 void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
 860                                      ext4_group_t group,
 861                                      unsigned int flags)
 862 {
 863         struct ext4_sb_info *sbi = EXT4_SB(sb);
 864         struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 865         struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 866         int ret;
 867
 868         if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
 869                 ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
 870                                             &grp->bb_state);
 871                 if (!ret)
 872                         percpu_counter_sub(&sbi->s_freeclusters_counter,
 873                                            grp->bb_free);
 874         }
 875
 876         if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
 877                 ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
 878                                             &grp->bb_state);
 879                 if (!ret && gdp) {
 880                         int count;
 881
 882                         count = ext4_free_inodes_count(sb, gdp);
 883                         percpu_counter_sub(&sbi->s_freeinodes_counter,
 884                                            count);
 885                 }
 886         }
 887 }
 888
 889 void ext4_update_dynamic_rev(struct super_block *sb)
 890 {
 891         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 892
 893         if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV)
 894                 return;
 895
 896         ext4_warning(sb,
 897                      "updating to rev %d because of new feature flag, "
 898                      "running e2fsck is recommended",
 899                      EXT4_DYNAMIC_REV);
 900
 901         es->s_first_ino = cpu_to_le32(EXT4_GOOD_OLD_FIRST_INO);
 902         es->s_inode_size = cpu_to_le16(EXT4_GOOD_OLD_INODE_SIZE);
 903         es->s_rev_level = cpu_to_le32(EXT4_DYNAMIC_REV);
 904         /* leave es->s_feature_*compat flags alone */
 905         /* es->s_uuid will be set by e2fsck if empty */
 906
 907         /*
 908          * The rest of the superblock fields should be zero, and if not it
 909          * means they are likely already in use, so leave them alone.  We
 910          * can leave it up to e2fsck to clean up any inconsistencies there.
 911          */
 912 }
 913
 914 /*
 915  * Open the external journal device
 916  */
 917 static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 918 {
 919         struct block_device *bdev;
 920
 921         bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 922         if (IS_ERR(bdev))
 923                 goto fail;
 924         return bdev;
 925
 926 fail:
 927         ext4_msg(sb, KERN_ERR,
 928                  "failed to open journal device unknown-block(%u,%u) %ld",
 929                  MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
 930         return NULL;
 931 }
 932
 933 /*
 934  * Release the journal device
 935  */
 936 static void ext4_blkdev_put(struct block_device *bdev)
 937 {
 938         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 939 }
 940
 941 static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
 942 {
 943         struct block_device *bdev;
 944         bdev = sbi->journal_bdev;
 945         if (bdev) {
 946                 ext4_blkdev_put(bdev);
 947                 sbi->journal_bdev = NULL;
 948         }
 949 }
 950
 951 static inline struct inode *orphan_list_entry(struct list_head *l)
 952 {
 953         return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode;
 954 }
 955
 956 static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 957 {
 958         struct list_head *l;
 959
 960         ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
 961                  le32_to_cpu(sbi->s_es->s_last_orphan));
 962
 963         printk(KERN_ERR "sb_info orphan list:\n");
 964         list_for_each(l, &sbi->s_orphan) {
 965                 struct inode *inode = orphan_list_entry(l);
 966                 printk(KERN_ERR "  "
 967                        "inode %s:%lu at %p: mode %o, nlink %d, next %d\n",
 968                        inode->i_sb->s_id, inode->i_ino, inode,
 969                        inode->i_mode, inode->i_nlink,
 970                        NEXT_ORPHAN(inode));
 971         }
 972 }
 973
 974 #ifdef CONFIG_QUOTA
 975 static int ext4_quota_off(struct super_block *sb, int type);
 976
 977 static inline void ext4_quota_off_umount(struct super_block *sb)
 978 {
 979         int type;
 980
 981         /* Use our quota_off function to clear inode flags etc. */
 982         for (type = 0; type < EXT4_MAXQUOTAS; type++)
 983                 ext4_quota_off(sb, type);
 984 }
 985
 986 /*
 987  * This is a helper function which is used in the mount/remount
 988  * codepaths (which holds s_umount) to fetch the quota file name.
 989  */
 990 static inline char *get_qf_name(struct super_block *sb,
 991                                 struct ext4_sb_info *sbi,
 992                                 int type)
 993 {
 994         return rcu_dereference_protected(sbi->s_qf_names[type],
 995                                          lockdep_is_held(&sb->s_umount));
 996 }
 997 #else
 998 static inline void ext4_quota_off_umount(struct super_block *sb)
 999 {
1000 }
1001 #endif
1002
1003 static void ext4_put_super(struct super_block *sb)
1004 {
1005         struct ext4_sb_info *sbi = EXT4_SB(sb);
1006         struct ext4_super_block *es = sbi->s_es;
1007         struct buffer_head **group_desc;
1008         struct flex_groups **flex_groups;
1009         int aborted = 0;
1010         int i, err;
1011
1012         ext4_unregister_li_request(sb);
1013         ext4_quota_off_umount(sb);
1014
1015         destroy_workqueue(sbi->rsv_conversion_wq);
1016
1017         /*
1018          * Unregister sysfs before destroying jbd2 journal.
1019          * Since we could still access attr_journal_task attribute via sysfs
1020          * path which could have sbi->s_journal->j_task as NULL
1021          */
1022         ext4_unregister_sysfs(sb);
1023
1024         if (sbi->s_journal) {
1025                 aborted = is_journal_aborted(sbi->s_journal);
1026                 err = jbd2_journal_destroy(sbi->s_journal);
1027                 sbi->s_journal = NULL;
1028                 if ((err < 0) && !aborted) {
1029                         ext4_abort(sb, -err, "Couldn't clean up the journal");
1030                 }
1031         }
1032
1033         ext4_es_unregister_shrinker(sbi);
1034         del_timer_sync(&sbi->s_err_report);
1035         ext4_release_system_zone(sb);
1036         ext4_mb_release(sb);
1037         ext4_ext_release(sb);
1038
1039         if (!sb_rdonly(sb) && !aborted) {
1040                 ext4_clear_feature_journal_needs_recovery(sb);
1041                 es->s_state = cpu_to_le16(sbi->s_mount_state);
1042         }
1043         if (!sb_rdonly(sb))
1044                 ext4_commit_super(sb, 1);
1045
1046         rcu_read_lock();
1047         group_desc = rcu_dereference(sbi->s_group_desc);
1048         for (i = 0; i < sbi->s_gdb_count; i++)
1049                 brelse(group_desc[i]);
1050         kvfree(group_desc);
1051         flex_groups = rcu_dereference(sbi->s_flex_groups);
1052         if (flex_groups) {
1053                 for (i = 0; i < sbi->s_flex_groups_allocated; i++)
1054                         kvfree(flex_groups[i]);
1055                 kvfree(flex_groups);
1056         }
1057         rcu_read_unlock();
1058         percpu_counter_destroy(&sbi->s_freeclusters_counter);
1059         percpu_counter_destroy(&sbi->s_freeinodes_counter);
1060         percpu_counter_destroy(&sbi->s_dirs_counter);
1061         percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
1062         percpu_free_rwsem(&sbi->s_writepages_rwsem);
1063 #ifdef CONFIG_QUOTA
1064         for (i = 0; i < EXT4_MAXQUOTAS; i++)
1065                 kfree(get_qf_name(sb, sbi, i));
1066 #endif
1067
1068         /* Debugging code just in case the in-memory inode orphan list
1069          * isn't empty.  The on-disk one can be non-empty if we've
1070          * detected an error and taken the fs readonly, but the
1071          * in-memory list had better be clean by this point. */
1072         if (!list_empty(&sbi->s_orphan))
1073                 dump_orphan_list(sb, sbi);
1074         J_ASSERT(list_empty(&sbi->s_orphan));
1075
1076         sync_blockdev(sb->s_bdev);
1077         invalidate_bdev(sb->s_bdev);
1078         if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
1079                 /*
1080                  * Invalidate the journal device's buffers.  We don't want them
1081                  * floating about in memory - the physical journal device may
1082                  * hotswapped, and it breaks the `ro-after' testing code.
1083                  */
1084                 sync_blockdev(sbi->journal_bdev);
1085                 invalidate_bdev(sbi->journal_bdev);
1086                 ext4_blkdev_remove(sbi);
1087         }
1088
1089         ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
1090         sbi->s_ea_inode_cache = NULL;
1091
1092         ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
1093         sbi->s_ea_block_cache = NULL;
1094
1095         if (sbi->s_mmp_tsk)
1096                 kthread_stop(sbi->s_mmp_tsk);
1097         brelse(sbi->s_sbh);
1098         sb->s_fs_info = NULL;
1099         /*
1100          * Now that we are completely done shutting down the
1101          * superblock, we need to actually destroy the kobject.
1102          */
1103         kobject_put(&sbi->s_kobj);
1104         wait_for_completion(&sbi->s_kobj_unregister);
1105         if (sbi->s_chksum_driver)
1106                 crypto_free_shash(sbi->s_chksum_driver);
1107         kfree(sbi->s_blockgroup_lock);
1108         fs_put_dax(sbi->s_daxdev);
1109 #ifdef CONFIG_UNICODE
1110         utf8_unload(sbi->s_encoding);
1111 #endif
1112         kfree(sbi);
1113 }
1114
1115 static struct kmem_cache *ext4_inode_cachep;
1116
1117 /*
1118  * Called inside transaction, so use GFP_NOFS
1119  */
1120 static struct inode *ext4_alloc_inode(struct super_block *sb)
1121 {
1122         struct ext4_inode_info *ei;
1123
1124         ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
1125         if (!ei)
1126                 return NULL;
1127
1128         inode_set_iversion(&ei->vfs_inode, 1);
1129         spin_lock_init(&ei->i_raw_lock);
1130         INIT_LIST_HEAD(&ei->i_prealloc_list);
1131         spin_lock_init(&ei->i_prealloc_lock);
1132         ext4_es_init_tree(&ei->i_es_tree);
1133         rwlock_init(&ei->i_es_lock);
1134         INIT_LIST_HEAD(&ei->i_es_list);
1135         ei->i_es_all_nr = 0;
1136         ei->i_es_shk_nr = 0;
1137         ei->i_es_shrink_lblk = 0;
1138         ei->i_reserved_data_blocks = 0;
1139         spin_lock_init(&(ei->i_block_reservation_lock));
1140         ext4_init_pending_tree(&ei->i_pending_tree);
1141 #ifdef CONFIG_QUOTA
1142         ei->i_reserved_quota = 0;
1143         memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
1144 #endif
1145         ei->jinode = NULL;
1146         INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
1147         spin_lock_init(&ei->i_completed_io_lock);
1148         ei->i_sync_tid = 0;
1149         ei->i_datasync_tid = 0;
1150         atomic_set(&ei->i_unwritten, 0);
1151         INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
1152         return &ei->vfs_inode;
1153 }
1154
/*
 * ->drop_inode: use the generic decision first and consult fscrypt only
 * when the generic helper says the inode need not be dropped.  The final
 * answer is traced and returned.
 */
static int ext4_drop_inode(struct inode *inode)
{
	int drop;

	drop = generic_drop_inode(inode);
	if (drop == 0)
		drop = fscrypt_drop_inode(inode);

	trace_ext4_drop_inode(inode, drop);
	return drop;
}
1165
1166 static void ext4_free_in_core_inode(struct inode *inode)
1167 {
1168         fscrypt_free_inode(inode);
1169         kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
1170 }
1171
1172 static void ext4_destroy_inode(struct inode *inode)
1173 {
1174         if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
1175                 ext4_msg(inode->i_sb, KERN_ERR,
1176                          "Inode %lu (%p): orphan list check failed!",
1177                          inode->i_ino, EXT4_I(inode));
1178                 print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
1179                                 EXT4_I(inode), sizeof(struct ext4_inode_info),
1180                                 true);
1181                 dump_stack();
1182         }
1183 }
1184
1185 static void init_once(void *foo)
1186 {
1187         struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
1188
1189         INIT_LIST_HEAD(&ei->i_orphan);
1190         init_rwsem(&ei->xattr_sem);
1191         init_rwsem(&ei->i_data_sem);
1192         init_rwsem(&ei->i_mmap_sem);
1193         inode_init_once(&ei->vfs_inode);
1194 }
1195
1196 static int __init init_inodecache(void)
1197 {
1198         ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
1199                                 sizeof(struct ext4_inode_info), 0,
1200                                 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
1201                                         SLAB_ACCOUNT),
1202                                 offsetof(struct ext4_inode_info, i_data),
1203                                 sizeof_field(struct ext4_inode_info, i_data),
1204                                 init_once);
1205         if (ext4_inode_cachep == NULL)
1206                 return -ENOMEM;
1207         return 0;
1208 }
1209
1210 static void destroy_inodecache(void)
1211 {
1212         /*
1213          * Make sure all delayed rcu free inodes are flushed before we
1214          * destroy cache.
1215          */
1216         rcu_barrier();
1217         kmem_cache_destroy(ext4_inode_cachep);
1218 }
1219
1220 void ext4_clear_inode(struct inode *inode)
1221 {
1222         invalidate_inode_buffers(inode);
1223         clear_inode(inode);
1224         ext4_discard_preallocations(inode);
1225         ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
1226         dquot_drop(inode);
1227         if (EXT4_I(inode)->jinode) {
1228                 jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
1229                                                EXT4_I(inode)->jinode);
1230                 jbd2_free_inode(EXT4_I(inode)->jinode);
1231                 EXT4_I(inode)->jinode = NULL;
1232         }
1233         fscrypt_put_encryption_info(inode);
1234         fsverity_cleanup_inode(inode);
1235 }
1236
1237 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
1238                                         u64 ino, u32 generation)
1239 {
1240         struct inode *inode;
1241
1242         /*
1243          * Currently we don't know the generation for parent directory, so
1244          * a generation of 0 means "accept any"
1245          */
1246         inode = ext4_iget(sb, ino, EXT4_IGET_HANDLE);
1247         if (IS_ERR(inode))
1248                 return ERR_CAST(inode);
1249         if (generation && inode->i_generation != generation) {
1250                 iput(inode);
1251                 return ERR_PTR(-ESTALE);
1252         }
1253
1254         return inode;
1255 }
1256
1257 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
1258                                         int fh_len, int fh_type)
1259 {
1260         return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
1261                                     ext4_nfs_get_inode);
1262 }
1263
1264 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
1265                                         int fh_len, int fh_type)
1266 {
1267         return generic_fh_to_parent(sb, fid, fh_len, fh_type,
1268                                     ext4_nfs_get_inode);
1269 }
1270
1271 static int ext4_nfs_commit_metadata(struct inode *inode)
1272 {
1273         struct writeback_control wbc = {
1274                 .sync_mode = WB_SYNC_ALL
1275         };
1276
1277         trace_ext4_nfs_commit_metadata(inode);
1278         return ext4_write_inode(inode, &wbc);
1279 }
1280
1281 /*
1282  * Try to release metadata pages (indirect blocks, directories) which are
1283  * mapped via the block device.  Since these pages could have journal heads
1284  * which would prevent try_to_free_buffers() from freeing them, we must use
1285  * jbd2 layer's try_to_free_buffers() function to release them.
1286  */
1287 static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
1288                                  gfp_t wait)
1289 {
1290         journal_t *journal = EXT4_SB(sb)->s_journal;
1291
1292         WARN_ON(PageChecked(page));
1293         if (!page_has_buffers(page))
1294                 return 0;
1295         if (journal)
1296                 return jbd2_journal_try_to_free_buffers(journal, page,
1297                                                 wait & ~__GFP_DIRECT_RECLAIM);
1298         return try_to_free_buffers(page);
1299 }
1300
1301 #ifdef CONFIG_FS_ENCRYPTION
1302 static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
1303 {
1304         return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
1305                                  EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
1306 }
1307
1308 static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
1309                                                         void *fs_data)
1310 {
1311         handle_t *handle = fs_data;
1312         int res, res2, credits, retries = 0;
1313
1314         /*
1315          * Encrypting the root directory is not allowed because e2fsck expects
1316          * lost+found to exist and be unencrypted, and encrypting the root
1317          * directory would imply encrypting the lost+found directory as well as
1318          * the filename "lost+found" itself.
1319          */
1320         if (inode->i_ino == EXT4_ROOT_INO)
1321                 return -EPERM;
1322
1323         if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
1324                 return -EINVAL;
1325
1326         if (ext4_test_inode_flag(inode, EXT4_INODE_DAX))
1327                 return -EOPNOTSUPP;
1328
1329         res = ext4_convert_inline_data(inode);
1330         if (res)
1331                 return res;
1332
1333         /*
1334          * If a journal handle was specified, then the encryption context is
1335          * being set on a new inode via inheritance and is part of a larger
1336          * transaction to create the inode.  Otherwise the encryption context is
1337          * being set on an existing inode in its own transaction.  Only in the
1338          * latter case should the "retry on ENOSPC" logic be used.
1339          */
1340
1341         if (handle) {
1342                 res = ext4_xattr_set_handle(handle, inode,
1343                                             EXT4_XATTR_INDEX_ENCRYPTION,
1344                                             EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
1345                                             ctx, len, 0);
1346                 if (!res) {
1347                         ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
1348                         ext4_clear_inode_state(inode,
1349                                         EXT4_STATE_MAY_INLINE_DATA);
1350                         /*
1351                          * Update inode->i_flags - S_ENCRYPTED will be enabled,
1352                          * S_DAX may be disabled
1353                          */
1354                         ext4_set_inode_flags(inode, false);
1355                 }
1356                 return res;
1357         }
1358
1359         res = dquot_initialize(inode);
1360         if (res)
1361                 return res;
1362 retry:
1363         res = ext4_xattr_set_credits(inode, len, false /* is_create */,
1364                                      &credits);
1365         if (res)
1366                 return res;
1367
1368         handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
1369         if (IS_ERR(handle))
1370                 return PTR_ERR(handle);
1371
1372         res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION,
1373                                     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
1374                                     ctx, len, 0);
1375         if (!res) {
1376                 ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
1377                 /*
1378                  * Update inode->i_flags - S_ENCRYPTED will be enabled,
1379                  * S_DAX may be disabled
1380                  */
1381                 ext4_set_inode_flags(inode, false);
1382                 res = ext4_mark_inode_dirty(handle, inode);
1383                 if (res)
1384                         EXT4_ERROR_INODE(inode, "Failed to mark inode dirty");
1385         }
1386         res2 = ext4_journal_stop(handle);
1387
1388         if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
1389                 goto retry;
1390         if (!res)
1391                 res = res2;
1392         return res;
1393 }
1394
1395 static bool ext4_dummy_context(struct inode *inode)
1396 {
1397         return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb));
1398 }
1399
1400 static bool ext4_has_stable_inodes(struct super_block *sb)
1401 {
1402         return ext4_has_feature_stable_inodes(sb);
1403 }
1404
1405 static void ext4_get_ino_and_lblk_bits(struct super_block *sb,
1406                                        int *ino_bits_ret, int *lblk_bits_ret)
1407 {
1408         *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count);
1409         *lblk_bits_ret = 8 * sizeof(ext4_lblk_t);
1410 }
1411
1412 static const struct fscrypt_operations ext4_cryptops = {
1413         .key_prefix             = "ext4:",
1414         .get_context            = ext4_get_context,
1415         .set_context            = ext4_set_context,
1416         .dummy_context          = ext4_dummy_context,
1417         .empty_dir              = ext4_empty_dir,
1418         .max_namelen            = EXT4_NAME_LEN,
1419         .has_stable_inodes      = ext4_has_stable_inodes,
1420         .get_ino_and_lblk_bits  = ext4_get_ino_and_lblk_bits,
1421 };
1422 #endif
1423
1424 #ifdef CONFIG_QUOTA
1425 static const char * const quotatypes[] = INITQFNAMES;
1426 #define QTYPE2NAME(t) (quotatypes[t])
1427
1428 static int ext4_write_dquot(struct dquot *dquot);
1429 static int ext4_acquire_dquot(struct dquot *dquot);
1430 static int ext4_release_dquot(struct dquot *dquot);
1431 static int ext4_mark_dquot_dirty(struct dquot *dquot);
1432 static int ext4_write_info(struct super_block *sb, int type);
1433 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1434                          const struct path *path);
1435 static int ext4_quota_on_mount(struct super_block *sb, int type);
1436 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1437                                size_t len, loff_t off);
1438 static ssize_t ext4_quota_write(struct super_block *sb, int type,
1439                                 const char *data, size_t len, loff_t off);
1440 static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
1441                              unsigned int flags);
1442 static int ext4_enable_quotas(struct super_block *sb);
1443
1444 static struct dquot **ext4_get_dquots(struct inode *inode)
1445 {
1446         return EXT4_I(inode)->i_dquot;
1447 }
1448
1449 static const struct dquot_operations ext4_quota_operations = {
1450         .get_reserved_space     = ext4_get_reserved_space,
1451         .write_dquot            = ext4_write_dquot,
1452         .acquire_dquot          = ext4_acquire_dquot,
1453         .release_dquot          = ext4_release_dquot,
1454         .mark_dirty             = ext4_mark_dquot_dirty,
1455         .write_info             = ext4_write_info,
1456         .alloc_dquot            = dquot_alloc,
1457         .destroy_dquot          = dquot_destroy,
1458         .get_projid             = ext4_get_projid,
1459         .get_inode_usage        = ext4_get_inode_usage,
1460         .get_next_id            = dquot_get_next_id,
1461 };
1462
1463 static const struct quotactl_ops ext4_qctl_operations = {
1464         .quota_on       = ext4_quota_on,
1465         .quota_off      = ext4_quota_off,
1466         .quota_sync     = dquot_quota_sync,
1467         .get_state      = dquot_get_state,
1468         .set_info       = dquot_set_dqinfo,
1469         .get_dqblk      = dquot_get_dqblk,
1470         .set_dqblk      = dquot_set_dqblk,
1471         .get_nextdqblk  = dquot_get_next_dqblk,
1472 };
1473 #endif
1474
1475 static const struct super_operations ext4_sops = {
1476         .alloc_inode    = ext4_alloc_inode,
1477         .free_inode     = ext4_free_in_core_inode,
1478         .destroy_inode  = ext4_destroy_inode,
1479         .write_inode    = ext4_write_inode,
1480         .dirty_inode    = ext4_dirty_inode,
1481         .drop_inode     = ext4_drop_inode,
1482         .evict_inode    = ext4_evict_inode,
1483         .put_super      = ext4_put_super,
1484         .sync_fs        = ext4_sync_fs,
1485         .freeze_fs      = ext4_freeze,
1486         .unfreeze_fs    = ext4_unfreeze,
1487         .statfs         = ext4_statfs,
1488         .remount_fs     = ext4_remount,
1489         .show_options   = ext4_show_options,
1490 #ifdef CONFIG_QUOTA
1491         .quota_read     = ext4_quota_read,
1492         .quota_write    = ext4_quota_write,
1493         .get_dquots     = ext4_get_dquots,
1494 #endif
1495         .bdev_try_to_free_page = bdev_try_to_free_page,
1496 };
1497
1498 static const struct export_operations ext4_export_ops = {
1499         .fh_to_dentry = ext4_fh_to_dentry,
1500         .fh_to_parent = ext4_fh_to_parent,
1501         .get_parent = ext4_get_parent,
1502         .commit_metadata = ext4_nfs_commit_metadata,
1503 };
1504
/*
 * Mount option identifiers.  The enumerators are listed one per line in
 * their original order; the order determines their values, so new entries
 * must not be inserted in a way that callers depending on the values would
 * notice.
 */
enum {
	Opt_bsd_df,
	Opt_minix_df,
	Opt_grpid,
	Opt_nogrpid,
	Opt_resgid,
	Opt_resuid,
	Opt_sb,
	Opt_err_cont,
	Opt_err_panic,
	Opt_err_ro,
	Opt_nouid32,
	Opt_debug,
	Opt_removed,
	Opt_user_xattr,
	Opt_nouser_xattr,
	Opt_acl,
	Opt_noacl,
	Opt_auto_da_alloc,
	Opt_noauto_da_alloc,
	Opt_noload,
	Opt_commit,
	Opt_min_batch_time,
	Opt_max_batch_time,
	Opt_journal_dev,
	Opt_journal_path,
	Opt_journal_checksum,
	Opt_journal_async_commit,
	Opt_abort,
	Opt_data_journal,
	Opt_data_ordered,
	Opt_data_writeback,
	Opt_data_err_abort,
	Opt_data_err_ignore,
	Opt_test_dummy_encryption,
	Opt_usrjquota,
	Opt_grpjquota,
	Opt_offusrjquota,
	Opt_offgrpjquota,
	Opt_jqfmt_vfsold,
	Opt_jqfmt_vfsv0,
	Opt_jqfmt_vfsv1,
	Opt_quota,
	Opt_noquota,
	Opt_barrier,
	Opt_nobarrier,
	Opt_err,
	Opt_usrquota,
	Opt_grpquota,
	Opt_prjquota,
	Opt_i_version,
	Opt_dax,
	Opt_dax_always,
	Opt_dax_inode,
	Opt_dax_never,
	Opt_stripe,
	Opt_delalloc,
	Opt_nodelalloc,
	Opt_warn_on_error,
	Opt_nowarn_on_error,
	Opt_mblk_io_submit,
	Opt_lazytime,
	Opt_nolazytime,
	Opt_debug_want_extra_isize,
	Opt_nomblk_io_submit,
	Opt_block_validity,
	Opt_noblock_validity,
	Opt_inode_readahead_blks,
	Opt_journal_ioprio,
	Opt_dioread_nolock,
	Opt_dioread_lock,
	Opt_discard,
	Opt_nodiscard,
	Opt_init_itable,
	Opt_noinit_itable,
	Opt_max_dir_size_kb,
	Opt_nojournal_checksum,
	Opt_nombcache,
};
1529
/*
 * Pattern table handed to match_token() by parse_options(): each entry maps
 * one option spelling to an Opt_* token.
 *
 *  - Several spellings may share a token (e.g. "grpid" and "bsdgroups").
 *  - Retired options map to Opt_removed; handle_mount_opt() warns about
 *    them and otherwise ignores them.
 *  - Patterns ending in "%u" or "%s" describe options that carry a value.
 *  - Some options appear twice, with and without a value ("barrier",
 *    "auto_da_alloc", "init_itable"); handle_mount_opt() tells the two
 *    forms apart by whether args->from was filled in.
 *  - NOTE(review): "usrjquota=" / "grpjquota=" are listed before their
 *    "=%s" forms; this presumably relies on match_token() returning the
 *    first matching entry so an empty value selects the Opt_off* token --
 *    confirm against lib/parser.c.
 *
 * The {Opt_err, NULL} entry terminates the table; token2str() uses it as
 * the stop condition for its scan.
 */
1530 static const match_table_t tokens = {
1531         {Opt_bsd_df, "bsddf"},
1532         {Opt_minix_df, "minixdf"},
1533         {Opt_grpid, "grpid"},
1534         {Opt_grpid, "bsdgroups"},
1535         {Opt_nogrpid, "nogrpid"},
1536         {Opt_nogrpid, "sysvgroups"},
1537         {Opt_resgid, "resgid=%u"},
1538         {Opt_resuid, "resuid=%u"},
1539         {Opt_sb, "sb=%u"},
1540         {Opt_err_cont, "errors=continue"},
1541         {Opt_err_panic, "errors=panic"},
1542         {Opt_err_ro, "errors=remount-ro"},
1543         {Opt_nouid32, "nouid32"},
1544         {Opt_debug, "debug"},
1545         {Opt_removed, "oldalloc"},
1546         {Opt_removed, "orlov"},
1547         {Opt_user_xattr, "user_xattr"},
1548         {Opt_nouser_xattr, "nouser_xattr"},
1549         {Opt_acl, "acl"},
1550         {Opt_noacl, "noacl"},
1551         {Opt_noload, "norecovery"},
1552         {Opt_noload, "noload"},
1553         {Opt_removed, "nobh"},
1554         {Opt_removed, "bh"},
1555         {Opt_commit, "commit=%u"},
1556         {Opt_min_batch_time, "min_batch_time=%u"},
1557         {Opt_max_batch_time, "max_batch_time=%u"},
1558         {Opt_journal_dev, "journal_dev=%u"},
1559         {Opt_journal_path, "journal_path=%s"},
1560         {Opt_journal_checksum, "journal_checksum"},
1561         {Opt_nojournal_checksum, "nojournal_checksum"},
1562         {Opt_journal_async_commit, "journal_async_commit"},
1563         {Opt_abort, "abort"},
1564         {Opt_data_journal, "data=journal"},
1565         {Opt_data_ordered, "data=ordered"},
1566         {Opt_data_writeback, "data=writeback"},
1567         {Opt_data_err_abort, "data_err=abort"},
1568         {Opt_data_err_ignore, "data_err=ignore"},
1569         {Opt_offusrjquota, "usrjquota="},
1570         {Opt_usrjquota, "usrjquota=%s"},
1571         {Opt_offgrpjquota, "grpjquota="},
1572         {Opt_grpjquota, "grpjquota=%s"},
1573         {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
1574         {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
1575         {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
1576         {Opt_grpquota, "grpquota"},
1577         {Opt_noquota, "noquota"},
1578         {Opt_quota, "quota"},
1579         {Opt_usrquota, "usrquota"},
1580         {Opt_prjquota, "prjquota"},
1581         {Opt_barrier, "barrier=%u"},
1582         {Opt_barrier, "barrier"},
1583         {Opt_nobarrier, "nobarrier"},
1584         {Opt_i_version, "i_version"},
1585         {Opt_dax, "dax"},
1586         {Opt_dax_always, "dax=always"},
1587         {Opt_dax_inode, "dax=inode"},
1588         {Opt_dax_never, "dax=never"},
1589         {Opt_stripe, "stripe=%u"},
1590         {Opt_delalloc, "delalloc"},
1591         {Opt_warn_on_error, "warn_on_error"},
1592         {Opt_nowarn_on_error, "nowarn_on_error"},
1593         {Opt_lazytime, "lazytime"},
1594         {Opt_nolazytime, "nolazytime"},
1595         {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
1596         {Opt_nodelalloc, "nodelalloc"},
1597         {Opt_removed, "mblk_io_submit"},
1598         {Opt_removed, "nomblk_io_submit"},
1599         {Opt_block_validity, "block_validity"},
1600         {Opt_noblock_validity, "noblock_validity"},
1601         {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
1602         {Opt_journal_ioprio, "journal_ioprio=%u"},
1603         {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1604         {Opt_auto_da_alloc, "auto_da_alloc"},
1605         {Opt_noauto_da_alloc, "noauto_da_alloc"},
1606         {Opt_dioread_nolock, "dioread_nolock"},
1607         {Opt_dioread_lock, "nodioread_nolock"},
1608         {Opt_dioread_lock, "dioread_lock"},
1609         {Opt_discard, "discard"},
1610         {Opt_nodiscard, "nodiscard"},
1611         {Opt_init_itable, "init_itable=%u"},
1612         {Opt_init_itable, "init_itable"},
1613         {Opt_noinit_itable, "noinit_itable"},
1614         {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
1615         {Opt_test_dummy_encryption, "test_dummy_encryption"},
1616         {Opt_nombcache, "nombcache"},
1617         {Opt_nombcache, "no_mbcache"},  /* for backward compatibility */
1618         {Opt_removed, "check=none"},    /* mount option from ext2/3 */
1619         {Opt_removed, "nocheck"},       /* mount option from ext2/3 */
1620         {Opt_removed, "reservation"},   /* mount option from ext2/3 */
1621         {Opt_removed, "noreservation"}, /* mount option from ext2/3 */
1622         {Opt_removed, "journal=%u"},    /* mount option from ext2/3 */
1623         {Opt_err, NULL},
1624 };
1625
/*
 * get_sb_block() - pull an alternate superblock location out of the mount
 * option string.
 * @data: in/out pointer to the option string (may point to NULL).
 *
 * Only an "sb=" option at the very start of the string is recognised.
 *
 * Returns the default block 1, leaving *data untouched, when there are no
 * options, the string does not begin with "sb=", or the number is followed
 * by something other than ',' or end of string (this last case also logs
 * an error).  Otherwise returns the parsed block number and advances *data
 * past "sb=<n>" and the comma that follows it, if any.
 */
1626 static ext4_fsblk_t get_sb_block(void **data)
1627 {
1628         ext4_fsblk_t    sb_block;
1629         char            *options = (char *) *data;
1630
1631         if (!options || strncmp(options, "sb=", 3) != 0)
1632                 return 1;       /* Default location */
1633
             /* Step over the "sb=" prefix checked above. */
1634         options += 3;
1635         /* TODO: use simple_strtoll with >32bit ext4 */
             /* On return, options points at the first unparsed character. */
1636         sb_block = simple_strtoul(options, &options, 0);
1637         if (*options && *options != ',') {
1638                 printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
1639                        (char *) *data);
1640                 return 1;
1641         }
             /* Consume the separator so the caller sees only the remaining options. */
1642         if (*options == ',')
1643                 options++;
1644         *data = (void *) options;
1645
1646         return sb_block;
1647 }
1648
/*
 * DEFAULT_JOURNAL_IOPRIO: I/O priority value built from class
 * IOPRIO_CLASS_BE at level 3.  NOTE(review): the name suggests it is the
 * journal's priority when "journal_ioprio=" is not given; the user is
 * outside this part of the file -- confirm.
 *
 * deprecated_msg: printf-style format taking the option text and a release
 * string; handle_mount_opt() uses it for "noacl" and "nouser_xattr".
 */
1649 #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
1650 static const char deprecated_msg[] =
1651         "Mount option \"%s\" will be removed by %s\n"
1652         "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n";
1653
1654 #ifdef CONFIG_QUOTA
/*
 * set_qf_name() - record the journaled quota file name for one quota type.
 * @sb:    superblock whose options are being parsed
 * @qtype: index into sbi->s_qf_names[] (handle_mount_opt() passes USRQUOTA
 *         or GRPQUOTA)
 * @args:  matched substring holding the file name
 *
 * Returns 1 when the name was stored, when the same name was already
 * recorded, or when the option is ignored because the QUOTA feature is
 * enabled.  Returns -1 when quota is already loaded and no name was
 * recorded for this type, on allocation failure, when a different name is
 * already recorded, or when the name contains a '/'.
 */
1655 static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
1656 {
1657         struct ext4_sb_info *sbi = EXT4_SB(sb);
1658         char *qname, *old_qname = get_qf_name(sb, sbi, qtype);
1659         int ret = -1;
1660
1661         if (sb_any_quota_loaded(sb) && !old_qname) {
1662                 ext4_msg(sb, KERN_ERR,
1663                         "Cannot change journaled "
1664                         "quota options when quota turned on");
1665                 return -1;
1666         }
1667         if (ext4_has_feature_quota(sb)) {
1668                 ext4_msg(sb, KERN_INFO, "Journaled quota options "
1669                          "ignored when QUOTA feature is enabled");
1670                 return 1;
1671         }
             /* Private copy of the name; freed at errout unless it is installed below. */
1672         qname = match_strdup(args);
1673         if (!qname) {
1674                 ext4_msg(sb, KERN_ERR,
1675                         "Not enough memory for storing quotafile name");
1676                 return -1;
1677         }
1678         if (old_qname) {
                     /* Repeating the existing name is accepted; a different one is an error. */
1679                 if (strcmp(old_qname, qname) == 0)
1680                         ret = 1;
1681                 else
1682                         ext4_msg(sb, KERN_ERR,
1683                                  "%s quota file already specified",
1684                                  QTYPE2NAME(qtype));
1685                 goto errout;
1686         }
1687         if (strchr(qname, '/')) {
1688                 ext4_msg(sb, KERN_ERR,
1689                         "quotafile must be on filesystem root");
1690                 goto errout;
1691         }
             /* The copy is now referenced from sbi->s_qf_names[] and must not be freed here. */
1692         rcu_assign_pointer(sbi->s_qf_names[qtype], qname);
1693         set_opt(sb, QUOTA);
1694         return 1;
1695 errout:
1696         kfree(qname);
1697         return ret;
1698 }
1699
/*
 * clear_qf_name() - forget the journaled quota file name for one quota type.
 * @sb:    superblock whose options are being parsed
 * @qtype: index into sbi->s_qf_names[]
 *
 * Returns -1 (after logging) when quota is loaded and a name is currently
 * recorded; otherwise clears the slot, frees the old name and returns 1.
 */
1700 static int clear_qf_name(struct super_block *sb, int qtype)
1701 {
1702
1703         struct ext4_sb_info *sbi = EXT4_SB(sb);
1704         char *old_qname = get_qf_name(sb, sbi, qtype);
1705
1706         if (sb_any_quota_loaded(sb) && old_qname) {
1707                 ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
1708                         " when quota turned on");
1709                 return -1;
1710         }
1711         rcu_assign_pointer(sbi->s_qf_names[qtype], NULL);
             /*
              * Wait for RCU readers before freeing: ext4_show_quota_options()
              * reads s_qf_names[] under rcu_read_lock().
              */
1712         synchronize_rcu();
1713         kfree(old_qname);
1714         return 1;
1715 }
1716 #endif
1717
/*
 * Flag bits for the "flags" member of struct mount_opts, as interpreted by
 * handle_mount_opt():
 *
 *   MOPT_SET        generic boolean option: set the mount_opt bits
 *   MOPT_CLEAR      generic boolean option: clear the mount_opt bits
 *   MOPT_NOSUPPORT  option is not supported in this build; an error is logged
 *   MOPT_EXPLICIT   also record in s_mount_opt2 that delalloc or
 *                   journal_checksum was requested explicitly
 *   MOPT_CLEAR_ERR  clear ERRORS_MASK before applying the option
 *   MOPT_GTE0       a supplied numeric argument must not be negative
 *   MOPT_Q          quota option: 0 with CONFIG_QUOTA, else MOPT_NOSUPPORT
 *   MOPT_QFMT       mount_opt holds a journaled quota format id
 *                   (MOPT_NOSUPPORT without CONFIG_QUOTA)
 *   MOPT_DATAJ      mount_opt holds a data= journalling mode
 *   MOPT_NO_EXT2    rejected when IS_EXT2_SB(sb)
 *   MOPT_NO_EXT3    rejected when IS_EXT3_SB(sb)
 *   MOPT_EXT4_ONLY  both of the above
 *   MOPT_STRING     argument is a string, so it is not parsed as an int
 *   MOPT_SKIP       not consulted by handle_mount_opt(); NOTE(review):
 *                   presumably used by the option-printing code -- confirm
 */
1718 #define MOPT_SET        0x0001
1719 #define MOPT_CLEAR      0x0002
1720 #define MOPT_NOSUPPORT  0x0004
1721 #define MOPT_EXPLICIT   0x0008
1722 #define MOPT_CLEAR_ERR  0x0010
1723 #define MOPT_GTE0       0x0020
1724 #ifdef CONFIG_QUOTA
1725 #define MOPT_Q          0
1726 #define MOPT_QFMT       0x0040
1727 #else
1728 #define MOPT_Q          MOPT_NOSUPPORT
1729 #define MOPT_QFMT       MOPT_NOSUPPORT
1730 #endif
1731 #define MOPT_DATAJ      0x0080
1732 #define MOPT_NO_EXT2    0x0100
1733 #define MOPT_NO_EXT3    0x0200
1734 #define MOPT_EXT4_ONLY  (MOPT_NO_EXT2 | MOPT_NO_EXT3)
1735 #define MOPT_STRING     0x0400
1736 #define MOPT_SKIP       0x0800
1737
/*
 * Per-token descriptor table searched linearly by handle_mount_opt().
 *
 *   token      Opt_* value produced by match_token()
 *   mount_opt  bits to set/clear in s_mount_opt for generic options; for
 *              MOPT_QFMT entries a quota format id, for MOPT_DATAJ entries
 *              the data mode; 0 where the option is handled by a dedicated
 *              branch
 *   flags      MOPT_* bits controlling validation and handling
 *
 * The table is terminated by the {Opt_err, 0, 0} entry; reaching it means
 * the token has no descriptor and the option is rejected.
 */
1738 static const struct mount_opts {
1739         int     token;
1740         int     mount_opt;
1741         int     flags;
1742 } ext4_mount_opts[] = {
1743         {Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
1744         {Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
1745         {Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
1746         {Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
1747         {Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
1748         {Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
1749         {Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK,
1750          MOPT_EXT4_ONLY | MOPT_SET},
1751         {Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK,
1752          MOPT_EXT4_ONLY | MOPT_CLEAR},
1753         {Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
1754         {Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
1755         {Opt_delalloc, EXT4_MOUNT_DELALLOC,
1756          MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1757         {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1758          MOPT_EXT4_ONLY | MOPT_CLEAR},
1759         {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
1760         {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
1761         {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1762          MOPT_EXT4_ONLY | MOPT_CLEAR},
1763         {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1764          MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
             /* journal_async_commit also turns on journal checksumming. */
1765         {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
1766                                     EXT4_MOUNT_JOURNAL_CHECKSUM),
1767          MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1768         {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
1769         {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
1770         {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
1771         {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
             /*
              * The data_err entries carry neither MOPT_SET nor MOPT_CLEAR:
              * handle_mount_opt() sets/clears the bit in token-specific branches.
              */
1772         {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
1773          MOPT_NO_EXT2},
1774         {Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT,
1775          MOPT_NO_EXT2},
1776         {Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
1777         {Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
             /* The stored bit is the negative form, so SET/CLEAR look swapped here. */
1778         {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
1779         {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
1780         {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
1781         {Opt_commit, 0, MOPT_GTE0},
1782         {Opt_max_batch_time, 0, MOPT_GTE0},
1783         {Opt_min_batch_time, 0, MOPT_GTE0},
1784         {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1785         {Opt_init_itable, 0, MOPT_GTE0},
1786         {Opt_dax, EXT4_MOUNT_DAX_ALWAYS, MOPT_SET | MOPT_SKIP},
1787         {Opt_dax_always, EXT4_MOUNT_DAX_ALWAYS,
1788                 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
1789         {Opt_dax_inode, EXT4_MOUNT2_DAX_INODE,
1790                 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
1791         {Opt_dax_never, EXT4_MOUNT2_DAX_NEVER,
1792                 MOPT_EXT4_ONLY | MOPT_SET | MOPT_SKIP},
1793         {Opt_stripe, 0, MOPT_GTE0},
1794         {Opt_resuid, 0, MOPT_GTE0},
1795         {Opt_resgid, 0, MOPT_GTE0},
1796         {Opt_journal_dev, 0, MOPT_NO_EXT2 | MOPT_GTE0},
1797         {Opt_journal_path, 0, MOPT_NO_EXT2 | MOPT_STRING},
1798         {Opt_journal_ioprio, 0, MOPT_NO_EXT2 | MOPT_GTE0},
1799         {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1800         {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
1801         {Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA,
1802          MOPT_NO_EXT2 | MOPT_DATAJ},
1803         {Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
1804         {Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
1805 #ifdef CONFIG_EXT4_FS_POSIX_ACL
1806         {Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
1807         {Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
1808 #else
1809         {Opt_acl, 0, MOPT_NOSUPPORT},
1810         {Opt_noacl, 0, MOPT_NOSUPPORT},
1811 #endif
1812         {Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
1813         {Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
1814         {Opt_debug_want_extra_isize, 0, MOPT_GTE0},
1815         {Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
1816         {Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
1817                                                         MOPT_SET | MOPT_Q},
1818         {Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
1819                                                         MOPT_SET | MOPT_Q},
1820         {Opt_prjquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_PRJQUOTA,
1821                                                         MOPT_SET | MOPT_Q},
1822         {Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
1823                        EXT4_MOUNT_GRPQUOTA | EXT4_MOUNT_PRJQUOTA),
1824                                                         MOPT_CLEAR | MOPT_Q},
1825         {Opt_usrjquota, 0, MOPT_Q},
1826         {Opt_grpjquota, 0, MOPT_Q},
1827         {Opt_offusrjquota, 0, MOPT_Q},
1828         {Opt_offgrpjquota, 0, MOPT_Q},
1829         {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
1830         {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1831         {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1832         {Opt_max_dir_size_kb, 0, MOPT_GTE0},
             /*
              * NOTE(review): "test_dummy_encryption" has no argument form in
              * tokens[], and the MOPT_GTE0 check only runs when args->from is
              * set, so the flag has no effect for this entry.
              */
1833         {Opt_test_dummy_encryption, 0, MOPT_GTE0},
1834         {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
1835         {Opt_err, 0, 0}
1836 };
1837
1838 #ifdef CONFIG_UNICODE
/*
 * Known on-disk filename encodings.  "magic" is the value compared against
 * the superblock's s_encoding field by ext4_sb_read_encoding(); "name" and
 * "version" are the associated strings (NOTE(review): presumably passed to
 * the unicode layer by the caller -- the user is outside this part of the
 * file).
 */
1839 static const struct ext4_sb_encodings {
1840         __u16 magic;
1841         char *name;
1842         char *version;
1843 } ext4_sb_encoding_map[] = {
1844         {EXT4_ENC_UTF8_12_1, "utf8", "12.1.0"},
1845 };
1846
/*
 * ext4_sb_read_encoding() - look up the encoding named in the superblock.
 * @es:       on-disk superblock
 * @encoding: out: matching entry of ext4_sb_encoding_map[]
 * @flags:    out: CPU-endian copy of es->s_encoding_flags
 *
 * Returns 0 on success.  Returns -EINVAL, leaving both outputs untouched,
 * when es->s_encoding matches no entry in the map.
 */
1847 static int ext4_sb_read_encoding(const struct ext4_super_block *es,
1848                                  const struct ext4_sb_encodings **encoding,
1849                                  __u16 *flags)
1850 {
1851         __u16 magic = le16_to_cpu(es->s_encoding);
1852         int i;
1853
1854         for (i = 0; i < ARRAY_SIZE(ext4_sb_encoding_map); i++)
1855                 if (magic == ext4_sb_encoding_map[i].magic)
1856                         break;
1857
             /* The loop ran to completion without a match. */
1858         if (i >= ARRAY_SIZE(ext4_sb_encoding_map))
1859                 return -EINVAL;
1860
1861         *encoding = &ext4_sb_encoding_map[i];
1862         *flags = le16_to_cpu(es->s_encoding_flags);
1863
1864         return 0;
1865 }
1866 #endif
1867
/*
 * handle_mount_opt() - validate and apply a single parsed mount option.
 * @sb:             superblock being mounted or remounted
 * @opt:            the option text as given; used only in messages
 * @token:          Opt_* value returned by match_token() for @opt
 * @args:           substrings captured by match_token(); args->from is NULL
 *                  when the option came without a value
 * @journal_devnum: out: set by "journal_dev=" and "journal_path="
 * @journal_ioprio: out: set by "journal_ioprio="
 * @is_remount:     non-zero on remount; journal device options are then
 *                  refused, and data= / dax options may only restate the
 *                  current setting
 *
 * Returns 1 when the option was applied or deliberately ignored, and -1 on
 * error (parse_options() treats any negative value as failure).
 *
 * Handling order: journaled quota names, then a few tokens that bypass the
 * table, then the ext4_mount_opts[] lookup, generic checks driven by the
 * MOPT_* flags, and finally a per-token/per-flag dispatch chain.
 */
1868 static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1869                             substring_t *args, unsigned long *journal_devnum,
1870                             unsigned int *journal_ioprio, int is_remount)
1871 {
1872         struct ext4_sb_info *sbi = EXT4_SB(sb);
1873         const struct mount_opts *m;
1874         kuid_t uid;
1875         kgid_t gid;
1876         int arg = 0;
1877
1878 #ifdef CONFIG_QUOTA
1879         if (token == Opt_usrjquota)
1880                 return set_qf_name(sb, USRQUOTA, &args[0]);
1881         else if (token == Opt_grpjquota)
1882                 return set_qf_name(sb, GRPQUOTA, &args[0]);
1883         else if (token == Opt_offusrjquota)
1884                 return clear_qf_name(sb, USRQUOTA);
1885         else if (token == Opt_offgrpjquota)
1886                 return clear_qf_name(sb, GRPQUOTA);
1887 #endif
1888         switch (token) {
1889         case Opt_noacl:
1890         case Opt_nouser_xattr:
                     /* Warn only; "break" lets the table-driven code below apply the option. */
1891                 ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
1892                 break;
1893         case Opt_sb:
1894                 return 1;       /* handled by get_sb_block() */
1895         case Opt_removed:
1896                 ext4_msg(sb, KERN_WARNING, "Ignoring removed %s option", opt);
1897                 return 1;
1898         case Opt_abort:
1899                 sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
1900                 return 1;
1901         case Opt_i_version:
1902                 sb->s_flags |= SB_I_VERSION;
1903                 return 1;
1904         case Opt_lazytime:
1905                 sb->s_flags |= SB_LAZYTIME;
1906                 return 1;
1907         case Opt_nolazytime:
1908                 sb->s_flags &= ~SB_LAZYTIME;
1909                 return 1;
1910         }
1911
             /* Find the descriptor for this token; m ends on the sentinel if there is none. */
1912         for (m = ext4_mount_opts; m->token != Opt_err; m++)
1913                 if (token == m->token)
1914                         break;
1915
1916         if (m->token == Opt_err) {
1917                 ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
1918                          "or missing value", opt);
1919                 return -1;
1920         }
1921
1922         if ((m->flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) {
1923                 ext4_msg(sb, KERN_ERR,
1924                          "Mount option \"%s\" incompatible with ext2", opt);
1925                 return -1;
1926         }
1927         if ((m->flags & MOPT_NO_EXT3) && IS_EXT3_SB(sb)) {
1928                 ext4_msg(sb, KERN_ERR,
1929                          "Mount option \"%s\" incompatible with ext3", opt);
1930                 return -1;
1931         }
1932
             /* Any supplied value that is not flagged MOPT_STRING is parsed as an int. */
1933         if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
1934                 return -1;
1935         if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
1936                 return -1;
1937         if (m->flags & MOPT_EXPLICIT) {
1938                 if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
1939                         set_opt2(sb, EXPLICIT_DELALLOC);
1940                 } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
1941                         set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
1942                 } else
1943                         return -1;
1944         }
1945         if (m->flags & MOPT_CLEAR_ERR)
1946                 clear_opt(sb, ERRORS_MASK);
1947         if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
1948                 ext4_msg(sb, KERN_ERR, "Cannot change quota "
1949                          "options when quota turned on");
1950                 return -1;
1951         }
1952
             /*
              * NOTE(review): an unsupported option is logged at KERN_ERR but the
              * function still falls through to "return 1" at the end, so the
              * mount is not failed because of it.
              */
1953         if (m->flags & MOPT_NOSUPPORT) {
1954                 ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
1955         } else if (token == Opt_commit) {
                     /* 0 selects the default; the upper bound keeps HZ * arg within int. */
1956                 if (arg == 0)
1957                         arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
1958                 else if (arg > INT_MAX / HZ) {
1959                         ext4_msg(sb, KERN_ERR,
1960                                  "Invalid commit interval %d, "
1961                                  "must be smaller than %d",
1962                                  arg, INT_MAX / HZ);
1963                         return -1;
1964                 }
1965                 sbi->s_commit_interval = HZ * arg;
1966         } else if (token == Opt_debug_want_extra_isize) {
                     /* Must be even, at least 4, and fit in the inode beyond the old 128-byte layout. */
1967                 if ((arg & 1) ||
1968                     (arg < 4) ||
1969                     (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
1970                         ext4_msg(sb, KERN_ERR,
1971                                  "Invalid want_extra_isize %d", arg);
1972                         return -1;
1973                 }
1974                 sbi->s_want_extra_isize = arg;
1975         } else if (token == Opt_max_batch_time) {
1976                 sbi->s_max_batch_time = arg;
1977         } else if (token == Opt_min_batch_time) {
1978                 sbi->s_min_batch_time = arg;
1979         } else if (token == Opt_inode_readahead_blks) {
1980                 if (arg && (arg > (1 << 30) || !is_power_of_2(arg))) {
1981                         ext4_msg(sb, KERN_ERR,
1982                                  "EXT4-fs: inode_readahead_blks must be "
1983                                  "0 or a power of 2 smaller than 2^31");
1984                         return -1;
1985                 }
1986                 sbi->s_inode_readahead_blks = arg;
1987         } else if (token == Opt_init_itable) {
1988                 set_opt(sb, INIT_INODE_TABLE);
                     /* Bare "init_itable" (no "=N") uses the default wait multiplier. */
1989                 if (!args->from)
1990                         arg = EXT4_DEF_LI_WAIT_MULT;
1991                 sbi->s_li_wait_mult = arg;
1992         } else if (token == Opt_max_dir_size_kb) {
1993                 sbi->s_max_dir_size_kb = arg;
1994         } else if (token == Opt_stripe) {
1995                 sbi->s_stripe = arg;
1996         } else if (token == Opt_resuid) {
                     /* The id is interpreted in the mounting task's user namespace. */
1997                 uid = make_kuid(current_user_ns(), arg);
1998                 if (!uid_valid(uid)) {
1999                         ext4_msg(sb, KERN_ERR, "Invalid uid value %d", arg);
2000                         return -1;
2001                 }
2002                 sbi->s_resuid = uid;
2003         } else if (token == Opt_resgid) {
2004                 gid = make_kgid(current_user_ns(), arg);
2005                 if (!gid_valid(gid)) {
2006                         ext4_msg(sb, KERN_ERR, "Invalid gid value %d", arg);
2007                         return -1;
2008                 }
2009                 sbi->s_resgid = gid;
2010         } else if (token == Opt_journal_dev) {
2011                 if (is_remount) {
2012                         ext4_msg(sb, KERN_ERR,
2013                                  "Cannot specify journal on remount");
2014                         return -1;
2015                 }
2016                 *journal_devnum = arg;
2017         } else if (token == Opt_journal_path) {
2018                 char *journal_path;
2019                 struct inode *journal_inode;
2020                 struct path path;
2021                 int error;
2022
2023                 if (is_remount) {
2024                         ext4_msg(sb, KERN_ERR,
2025                                  "Cannot specify journal on remount");
2026                         return -1;
2027                 }
                     /* Temporary copy of the path; freed on every exit from this branch. */
2028                 journal_path = match_strdup(&args[0]);
2029                 if (!journal_path) {
2030                         ext4_msg(sb, KERN_ERR, "error: could not dup "
2031                                 "journal device string");
2032                         return -1;
2033                 }
2034
2035                 error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
2036                 if (error) {
2037                         ext4_msg(sb, KERN_ERR, "error: could not find "
2038                                 "journal device path: error %d", error);
2039                         kfree(journal_path);
2040                         return -1;
2041                 }
2042
2043                 journal_inode = d_inode(path.dentry);
2044                 if (!S_ISBLK(journal_inode->i_mode)) {
2045                         ext4_msg(sb, KERN_ERR, "error: journal path %s "
2046                                 "is not a block device", journal_path);
2047                         path_put(&path);
2048                         kfree(journal_path);
2049                         return -1;
2050                 }
2051
                     /* Only the device number is kept; the path reference is dropped here. */
2052                 *journal_devnum = new_encode_dev(journal_inode->i_rdev);
2053                 path_put(&path);
2054                 kfree(journal_path);
2055         } else if (token == Opt_journal_ioprio) {
                     /* Negative values were already rejected via MOPT_GTE0. */
2056                 if (arg > 7) {
2057                         ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
2058                                  " (must be 0-7)");
2059                         return -1;
2060                 }
2061                 *journal_ioprio =
2062                         IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
2063         } else if (token == Opt_test_dummy_encryption) {
2064 #ifdef CONFIG_FS_ENCRYPTION
2065                 sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
2066                 ext4_msg(sb, KERN_WARNING,
2067                          "Test dummy encryption mode enabled");
2068 #else
2069                 ext4_msg(sb, KERN_WARNING,
2070                          "Test dummy encryption mount option ignored");
2071 #endif
2072         } else if (m->flags & MOPT_DATAJ) {
                     /* On remount the data mode may only be restated, never changed. */
2073                 if (is_remount) {
2074                         if (!sbi->s_journal)
2075                                 ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
2076                         else if (test_opt(sb, DATA_FLAGS) != m->mount_opt) {
2077                                 ext4_msg(sb, KERN_ERR,
2078                                          "Cannot change data mode on remount");
2079                                 return -1;
2080                         }
2081                 } else {
2082                         clear_opt(sb, DATA_FLAGS);
2083                         sbi->s_mount_opt |= m->mount_opt;
2084                 }
2085 #ifdef CONFIG_QUOTA
2086         } else if (m->flags & MOPT_QFMT) {
2087                 if (sb_any_quota_loaded(sb) &&
2088                     sbi->s_jquota_fmt != m->mount_opt) {
2089                         ext4_msg(sb, KERN_ERR, "Cannot change journaled "
2090                                  "quota options when quota turned on");
2091                         return -1;
2092                 }
2093                 if (ext4_has_feature_quota(sb)) {
2094                         ext4_msg(sb, KERN_INFO,
2095                                  "Quota format mount options ignored "
2096                                  "when QUOTA feature is enabled");
2097                         return 1;
2098                 }
2099                 sbi->s_jquota_fmt = m->mount_opt;
2100 #endif
2101         } else if (token == Opt_dax || token == Opt_dax_always ||
2102                    token == Opt_dax_inode || token == Opt_dax_never) {
2103 #ifdef CONFIG_FS_DAX
                     /*
                      * On remount each dax option is accepted only if it matches the
                      * mode already in effect; anything else jumps to the shared
                      * fail_dax_change_remount label inside the first case.
                      */
2104                 switch (token) {
2105                 case Opt_dax:
2106                 case Opt_dax_always:
2107                         if (is_remount &&
2108                             (!(sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2109                              (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER))) {
2110                         fail_dax_change_remount:
2111                                 ext4_msg(sb, KERN_ERR, "can't change "
2112                                          "dax mount option while remounting");
2113                                 return -1;
2114                         }
2115                         if (is_remount &&
2116                             (test_opt(sb, DATA_FLAGS) ==
2117                              EXT4_MOUNT_JOURNAL_DATA)) {
2118                                     ext4_msg(sb, KERN_ERR, "can't mount with "
2119                                              "both data=journal and dax");
2120                                     return -1;
2121                         }
2122                         ext4_msg(sb, KERN_WARNING,
2123                                 "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
2124                         sbi->s_mount_opt |= EXT4_MOUNT_DAX_ALWAYS;
2125                         sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
2126                         break;
2127                 case Opt_dax_never:
2128                         if (is_remount &&
2129                             (!(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2130                              (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS)))
2131                                 goto fail_dax_change_remount;
2132                         sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
2133                         sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
2134                         break;
2135                 case Opt_dax_inode:
2136                         if (is_remount &&
2137                             ((sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) ||
2138                              (sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_NEVER) ||
2139                              !(sbi->s_mount_opt2 & EXT4_MOUNT2_DAX_INODE)))
2140                                 goto fail_dax_change_remount;
2141                         sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
2142                         sbi->s_mount_opt2 &= ~EXT4_MOUNT2_DAX_NEVER;
2143                         /* Strictly for printing options */
2144                         sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_INODE;
2145                         break;
2146                 }
2147 #else
                     /* Without CONFIG_FS_DAX every dax option is an error. */
2148                 ext4_msg(sb, KERN_INFO, "dax option not supported");
2149                 sbi->s_mount_opt2 |= EXT4_MOUNT2_DAX_NEVER;
2150                 sbi->s_mount_opt &= ~EXT4_MOUNT_DAX_ALWAYS;
2151                 return -1;
2152 #endif
2153         } else if (token == Opt_data_err_abort) {
2154                 sbi->s_mount_opt |= m->mount_opt;
2155         } else if (token == Opt_data_err_ignore) {
2156                 sbi->s_mount_opt &= ~m->mount_opt;
2157         } else {
                     /*
                      * Generic boolean option.  A bare option counts as "1"; an
                      * explicit "=N" supplies the value.  MOPT_CLEAR inverts it, so
                      * e.g. "auto_da_alloc=0" ends up setting the NO_AUTO_DA_ALLOC bit.
                      */
2158                 if (!args->from)
2159                         arg = 1;
2160                 if (m->flags & MOPT_CLEAR)
2161                         arg = !arg;
2162                 else if (unlikely(!(m->flags & MOPT_SET))) {
                             /* A table entry reached here with neither SET nor CLEAR. */
2163                         ext4_msg(sb, KERN_WARNING,
2164                                  "buggy handling of option %s", opt);
2165                         WARN_ON(1);
2166                         return -1;
2167                 }
2168                 if (arg != 0)
2169                         sbi->s_mount_opt |= m->mount_opt;
2170                 else
2171                         sbi->s_mount_opt &= ~m->mount_opt;
2172         }
2173         return 1;
2174 }
2175
/*
 * parse_options() - parse a comma-separated mount option string.
 * @options:        option string; split in place by strsep(); may be NULL
 * @sb:             superblock being mounted or remounted
 * @journal_devnum: passed through to handle_mount_opt()
 * @journal_ioprio: passed through to handle_mount_opt()
 * @is_remount:     passed through to handle_mount_opt()
 *
 * Each non-empty option is tokenised with match_token() and applied by
 * handle_mount_opt().  Afterwards the combined result is sanity-checked:
 * project quota needs the project feature, and journaled quota file names
 * must not be mixed with plain usrquota/grpquota and need a jqfmt=.
 *
 * Returns 1 on success (including a NULL @options) and 0 on failure.
 */
2176 static int parse_options(char *options, struct super_block *sb,
2177                          unsigned long *journal_devnum,
2178                          unsigned int *journal_ioprio,
2179                          int is_remount)
2180 {
2181         struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
2182         char *p, __maybe_unused *usr_qf_name, __maybe_unused *grp_qf_name;
2183         substring_t args[MAX_OPT_ARGS];
2184         int token;
2185
2186         if (!options)
2187                 return 1;
2188
2189         while ((p = strsep(&options, ",")) != NULL) {
                     /* Skip empty fields such as those produced by ",,". */
2190                 if (!*p)
2191                         continue;
2192                 /*
2193                  * Initialize args struct so we know whether arg was
2194                  * found; some options take optional arguments.
2195                  */
2196                 args[0].to = args[0].from = NULL;
2197                 token = match_token(p, tokens, args);
2198                 if (handle_mount_opt(sb, p, token, args, journal_devnum,
2199                                      journal_ioprio, is_remount) < 0)
2200                         return 0;
2201         }
2202 #ifdef CONFIG_QUOTA
2203         /*
2204          * We do the test below only for project quotas. 'usrquota' and
2205          * 'grpquota' mount options are allowed even without quota feature
2206          * to support legacy quotas in quota files.
2207          */
2208         if (test_opt(sb, PRJQUOTA) && !ext4_has_feature_project(sb)) {
2209                 ext4_msg(sb, KERN_ERR, "Project quota feature not enabled. "
2210                          "Cannot enable project quota enforcement.");
2211                 return 0;
2212         }
2213         usr_qf_name = get_qf_name(sb, sbi, USRQUOTA);
2214         grp_qf_name = get_qf_name(sb, sbi, GRPQUOTA);
2215         if (usr_qf_name || grp_qf_name) {
                     /*
                      * A journaled quota file name supersedes the plain flag for the
                      * same type.  If a plain flag is still set afterwards, one type
                      * uses a journaled file and the other does not, which is refused.
                      */
2216                 if (test_opt(sb, USRQUOTA) && usr_qf_name)
2217                         clear_opt(sb, USRQUOTA);
2218
2219                 if (test_opt(sb, GRPQUOTA) && grp_qf_name)
2220                         clear_opt(sb, GRPQUOTA);
2221
2222                 if (test_opt(sb, GRPQUOTA) || test_opt(sb, USRQUOTA)) {
2223                         ext4_msg(sb, KERN_ERR, "old and new quota "
2224                                         "format mixing");
2225                         return 0;
2226                 }
2227
2228                 if (!sbi->s_jquota_fmt) {
2229                         ext4_msg(sb, KERN_ERR, "journaled quota format "
2230                                         "not specified");
2231                         return 0;
2232                 }
2233         }
2234 #endif
             /* Advisory only: the warning below does not fail the mount. */
2235         if (test_opt(sb, DIOREAD_NOLOCK)) {
2236                 int blocksize =
2237                         BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
2238                 if (blocksize < PAGE_SIZE)
2239                         ext4_msg(sb, KERN_WARNING, "Warning: mounting with an "
2240                                  "experimental mount option 'dioread_nolock' "
2241                                  "for blocksize < PAGE_SIZE");
2242         }
2243         return 1;
2244 }
2245
/*
 * ext4_show_quota_options() - print the journaled quota mount options.
 * @seq: output sequence file
 * @sb:  superblock whose options are shown
 *
 * Emits ",jqfmt=<name>" when a journaled quota format is set, followed by
 * "usrjquota" / "grpjquota" entries for whichever quota file names are
 * recorded.  Compiles to an empty function without CONFIG_QUOTA.
 */
2246 static inline void ext4_show_quota_options(struct seq_file *seq,
2247                                            struct super_block *sb)
2248 {
2249 #if defined(CONFIG_QUOTA)
2250         struct ext4_sb_info *sbi = EXT4_SB(sb);
2251         char *usr_qf_name, *grp_qf_name;
2252
2253         if (sbi->s_jquota_fmt) {
                     /* Stays empty if s_jquota_fmt is not one of the formats below. */
2254                 char *fmtname = "";
2255
2256                 switch (sbi->s_jquota_fmt) {
2257                 case QFMT_VFS_OLD:
2258                         fmtname = "vfsold";
2259                         break;
2260                 case QFMT_VFS_V0:
2261                         fmtname = "vfsv0";
2262                         break;
2263                 case QFMT_VFS_V1:
2264                         fmtname = "vfsv1";
2265                         break;
2266                 }
2267                 seq_printf(seq, ",jqfmt=%s", fmtname);
2268         }
2269
             /*
              * The names are read under RCU; clear_qf_name() waits with
              * synchronize_rcu() before freeing a name it has unpublished.
              */
2270         rcu_read_lock();
2271         usr_qf_name = rcu_dereference(sbi->s_qf_names[USRQUOTA]);
2272         grp_qf_name = rcu_dereference(sbi->s_qf_names[GRPQUOTA]);
2273         if (usr_qf_name)
2274                 seq_show_option(seq, "usrjquota", usr_qf_name);
2275         if (grp_qf_name)
2276                 seq_show_option(seq, "grpjquota", grp_qf_name);
2277         rcu_read_unlock();
2278 #endif
2279 }
2280
2281 static const char *token2str(int token)
2282 {
2283         const struct match_token *t;
2284
2285         for (t = tokens; t->token != Opt_err; t++)
2286                 if (t->token == token && !strchr(t->pattern, '='))
2287                         break;
2288         return t->pattern;
2289 }
2290
2291 /*
2292  * Show an option if
2293  *  - it's set to a non-default value OR
2294  *  - if the per-sb default is different from the global default
2295  */
2296 static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
2297                               int nodefs)
2298 {
2299         struct ext4_sb_info *sbi = EXT4_SB(sb);
2300         struct ext4_super_block *es = sbi->s_es;
2301         int def_errors, def_mount_opt = sbi->s_def_mount_opt;
2302         const struct mount_opts *m;
2303         char sep = nodefs ? '\n' : ',';
2304
2305 #define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
2306 #define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
2307
2308         if (sbi->s_sb_block != 1)
2309                 SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
2310
2311         for (m = ext4_mount_opts; m->token != Opt_err; m++) {
2312                 int want_set = m->flags & MOPT_SET;
2313                 if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
2314                     (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP)
2315                         continue;
2316                 if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
2317                         continue; /* skip if same as the default */
2318                 if ((want_set &&
2319                      (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
2320                     (!want_set && (sbi->s_mount_opt & m->mount_opt)))
2321                         continue; /* select Opt_noFoo vs Opt_Foo */
2322                 SEQ_OPTS_PRINT("%s", token2str(m->token));
2323         }
2324
2325         if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) ||
2326             le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
2327                 SEQ_OPTS_PRINT("resuid=%u",
2328                                 from_kuid_munged(&init_user_ns, sbi->s_resuid));
2329         if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) ||
2330             le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
2331                 SEQ_OPTS_PRINT("resgid=%u",
2332                                 from_kgid_munged(&init_user_ns, sbi->s_resgid));
2333         def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
2334         if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
2335                 SEQ_OPTS_PUTS("errors=remount-ro");
2336         if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
2337                 SEQ_OPTS_PUTS("errors=continue");
2338         if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
2339                 SEQ_OPTS_PUTS("errors=panic");
2340         if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
2341                 SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
2342         if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
2343                 SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
2344         if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
2345                 SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
2346         if (sb->s_flags & SB_I_VERSION)
2347                 SEQ_OPTS_PUTS("i_version");
2348         if (nodefs || sbi->s_stripe)
2349                 SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
2350         if (nodefs || EXT4_MOUNT_DATA_FLAGS &
2351                         (sbi->s_mount_opt ^ def_mount_opt)) {
2352                 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
2353                         SEQ_OPTS_PUTS("data=journal");
2354                 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
2355                         SEQ_OPTS_PUTS("data=ordered");
2356                 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
2357                         SEQ_OPTS_PUTS("data=writeback");
2358         }
2359         if (nodefs ||
2360             sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
2361                 SEQ_OPTS_PRINT("inode_readahead_blks=%u",
2362                                sbi->s_inode_readahead_blks);
2363
2364         if (test_opt(sb, INIT_INODE_TABLE) && (nodefs ||
2365                        (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
2366                 SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
2367         if (nodefs || sbi->s_max_dir_size_kb)
2368                 SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
2369         if (test_opt(sb, DATA_ERR_ABORT))
2370                 SEQ_OPTS_PUTS("data_err=abort");
2371         if (DUMMY_ENCRYPTION_ENABLED(sbi))
2372                 SEQ_OPTS_PUTS("test_dummy_encryption");
2373
2374         if (test_opt(sb, DAX_ALWAYS)) {
2375                 if (IS_EXT2_SB(sb))
2376                         SEQ_OPTS_PUTS("dax");
2377                 else
2378                         SEQ_OPTS_PUTS("dax=always");
2379         } else if (test_opt2(sb, DAX_NEVER)) {
2380                 SEQ_OPTS_PUTS("dax=never");
2381         } else if (test_opt2(sb, DAX_INODE)) {
2382                 SEQ_OPTS_PUTS("dax=inode");
2383         }
2384
2385         ext4_show_quota_options(seq, sb);
2386         return 0;
2387 }
2388
2389 static int ext4_show_options(struct seq_file *seq, struct dentry *root)
2390 {
2391         return _ext4_show_options(seq, root->d_sb, 0);
2392 }
2393
2394 int ext4_seq_options_show(struct seq_file *seq, void *offset)
2395 {
2396         struct super_block *sb = seq->private;
2397         int rc;
2398
2399         seq_puts(seq, sb_rdonly(sb) ? "ro" : "rw");
2400         rc = _ext4_show_options(seq, sb, 1);
2401         seq_puts(seq, "\n");
2402         return rc;
2403 }
2404
2405 static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
2406                             int read_only)
2407 {
2408         struct ext4_sb_info *sbi = EXT4_SB(sb);
2409         int err = 0;
2410
2411         if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
2412                 ext4_msg(sb, KERN_ERR, "revision level too high, "
2413                          "forcing read-only mode");
2414                 err = -EROFS;
2415         }
2416         if (read_only)
2417                 goto done;
2418         if (!(sbi->s_mount_state & EXT4_VALID_FS))
2419                 ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
2420                          "running e2fsck is recommended");
2421         else if (sbi->s_mount_state & EXT4_ERROR_FS)
2422                 ext4_msg(sb, KERN_WARNING,
2423                          "warning: mounting fs with errors, "
2424                          "running e2fsck is recommended");
2425         else if ((__s16) le16_to_cpu(es->s_max_mnt_count) > 0 &&
2426                  le16_to_cpu(es->s_mnt_count) >=
2427                  (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
2428                 ext4_msg(sb, KERN_WARNING,
2429                          "warning: maximal mount count reached, "
2430                          "running e2fsck is recommended");
2431         else if (le32_to_cpu(es->s_checkinterval) &&
2432                  (ext4_get_tstamp(es, s_lastcheck) +
2433                   le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
2434                 ext4_msg(sb, KERN_WARNING,
2435                          "warning: checktime reached, "
2436                          "running e2fsck is recommended");
2437         if (!sbi->s_journal)
2438                 es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
2439         if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
2440                 es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
2441         le16_add_cpu(&es->s_mnt_count, 1);
2442         ext4_update_tstamp(es, s_mtime);
2443         if (sbi->s_journal)
2444                 ext4_set_feature_journal_needs_recovery(sb);
2445
2446         err = ext4_commit_super(sb, 1);
2447 done:
2448         if (test_opt(sb, DEBUG))
2449                 printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
2450                                 "bpg=%lu, ipg=%lu, mo=%04x, mo2=%04x]\n",
2451                         sb->s_blocksize,
2452                         sbi->s_groups_count,
2453                         EXT4_BLOCKS_PER_GROUP(sb),
2454                         EXT4_INODES_PER_GROUP(sb),
2455                         sbi->s_mount_opt, sbi->s_mount_opt2);
2456
2457         cleancache_init_fs(sb);
2458         return err;
2459 }
2460
2461 int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
2462 {
2463         struct ext4_sb_info *sbi = EXT4_SB(sb);
2464         struct flex_groups **old_groups, **new_groups;
2465         int size, i, j;
2466
2467         if (!sbi->s_log_groups_per_flex)
2468                 return 0;
2469
2470         size = ext4_flex_group(sbi, ngroup - 1) + 1;
2471         if (size <= sbi->s_flex_groups_allocated)
2472                 return 0;
2473
2474         new_groups = kvzalloc(roundup_pow_of_two(size *
2475                               sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
2476         if (!new_groups) {
2477                 ext4_msg(sb, KERN_ERR,
2478                          "not enough memory for %d flex group pointers", size);
2479                 return -ENOMEM;
2480         }
2481         for (i = sbi->s_flex_groups_allocated; i < size; i++) {
2482                 new_groups[i] = kvzalloc(roundup_pow_of_two(
2483                                          sizeof(struct flex_groups)),
2484                                          GFP_KERNEL);
2485                 if (!new_groups[i]) {
2486                         for (j = sbi->s_flex_groups_allocated; j < i; j++)
2487                                 kvfree(new_groups[j]);
2488                         kvfree(new_groups);
2489                         ext4_msg(sb, KERN_ERR,
2490                                  "not enough memory for %d flex groups", size);
2491                         return -ENOMEM;
2492                 }
2493         }
2494         rcu_read_lock();
2495         old_groups = rcu_dereference(sbi->s_flex_groups);
2496         if (old_groups)
2497                 memcpy(new_groups, old_groups,
2498                        (sbi->s_flex_groups_allocated *
2499                         sizeof(struct flex_groups *)));
2500         rcu_read_unlock();
2501         rcu_assign_pointer(sbi->s_flex_groups, new_groups);
2502         sbi->s_flex_groups_allocated = size;
2503         if (old_groups)
2504                 ext4_kvfree_array_rcu(old_groups);
2505         return 0;
2506 }
2507
2508 static int ext4_fill_flex_info(struct super_block *sb)
2509 {
2510         struct ext4_sb_info *sbi = EXT4_SB(sb);
2511         struct ext4_group_desc *gdp = NULL;
2512         struct flex_groups *fg;
2513         ext4_group_t flex_group;
2514         int i, err;
2515
2516         sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
2517         if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
2518                 sbi->s_log_groups_per_flex = 0;
2519                 return 1;
2520         }
2521
2522         err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
2523         if (err)
2524                 goto failed;
2525
2526         for (i = 0; i < sbi->s_groups_count; i++) {
2527                 gdp = ext4_get_group_desc(sb, i, NULL);
2528
2529                 flex_group = ext4_flex_group(sbi, i);
2530                 fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
2531                 atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
2532                 atomic64_add(ext4_free_group_clusters(sb, gdp),
2533                              &fg->free_clusters);
2534                 atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
2535         }
2536
2537         return 1;
2538 failed:
2539         return 0;
2540 }
2541
2542 static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
2543                                    struct ext4_group_desc *gdp)
2544 {
2545         int offset = offsetof(struct ext4_group_desc, bg_checksum);
2546         __u16 crc = 0;
2547         __le32 le_group = cpu_to_le32(block_group);
2548         struct ext4_sb_info *sbi = EXT4_SB(sb);
2549
2550         if (ext4_has_metadata_csum(sbi->s_sb)) {
2551                 /* Use new metadata_csum algorithm */
2552                 __u32 csum32;
2553                 __u16 dummy_csum = 0;
2554
2555                 csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
2556                                      sizeof(le_group));
2557                 csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
2558                 csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
2559                                      sizeof(dummy_csum));
2560                 offset += sizeof(dummy_csum);
2561                 if (offset < sbi->s_desc_size)
2562                         csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
2563                                              sbi->s_desc_size - offset);
2564
2565                 crc = csum32 & 0xFFFF;
2566                 goto out;
2567         }
2568
2569         /* old crc16 code */
2570         if (!ext4_has_feature_gdt_csum(sb))
2571                 return 0;
2572
2573         crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
2574         crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
2575         crc = crc16(crc, (__u8 *)gdp, offset);
2576         offset += sizeof(gdp->bg_checksum); /* skip checksum */
2577         /* for checksum of struct ext4_group_desc do the rest...*/
2578         if (ext4_has_feature_64bit(sb) &&
2579             offset < le16_to_cpu(sbi->s_es->s_desc_size))
2580                 crc = crc16(crc, (__u8 *)gdp + offset,
2581                             le16_to_cpu(sbi->s_es->s_desc_size) -
2582                                 offset);
2583
2584 out:
2585         return cpu_to_le16(crc);
2586 }
2587
2588 int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
2589                                 struct ext4_group_desc *gdp)
2590 {
2591         if (ext4_has_group_desc_csum(sb) &&
2592             (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
2593                 return 0;
2594
2595         return 1;
2596 }
2597
2598 void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
2599                               struct ext4_group_desc *gdp)
2600 {
2601         if (!ext4_has_group_desc_csum(sb))
2602                 return;
2603         gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
2604 }
2605
2606 /* Called at mount-time, super-block is locked */
2607 static int ext4_check_descriptors(struct super_block *sb,
2608                                   ext4_fsblk_t sb_block,
2609                                   ext4_group_t *first_not_zeroed)
2610 {
2611         struct ext4_sb_info *sbi = EXT4_SB(sb);
2612         ext4_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block);
2613         ext4_fsblk_t last_block;
2614         ext4_fsblk_t last_bg_block = sb_block + ext4_bg_num_gdb(sb, 0);
2615         ext4_fsblk_t block_bitmap;
2616         ext4_fsblk_t inode_bitmap;
2617         ext4_fsblk_t inode_table;
2618         int flexbg_flag = 0;
2619         ext4_group_t i, grp = sbi->s_groups_count;
2620
2621         if (ext4_has_feature_flex_bg(sb))
2622                 flexbg_flag = 1;
2623
2624         ext4_debug("Checking group descriptors");
2625
2626         for (i = 0; i < sbi->s_groups_count; i++) {
2627                 struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
2628
2629                 if (i == sbi->s_groups_count - 1 || flexbg_flag)
2630                         last_block = ext4_blocks_count(sbi->s_es) - 1;
2631                 else
2632                         last_block = first_block +
2633                                 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
2634
2635                 if ((grp == sbi->s_groups_count) &&
2636                    !(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
2637                         grp = i;
2638
2639                 block_bitmap = ext4_block_bitmap(sb, gdp);
2640                 if (block_bitmap == sb_block) {
2641                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2642                                  "Block bitmap for group %u overlaps "
2643                                  "superblock", i);
2644                         if (!sb_rdonly(sb))
2645                                 return 0;
2646                 }
2647                 if (block_bitmap >= sb_block + 1 &&
2648                     block_bitmap <= last_bg_block) {
2649                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2650                                  "Block bitmap for group %u overlaps "
2651                                  "block group descriptors", i);
2652                         if (!sb_rdonly(sb))
2653                                 return 0;
2654                 }
2655                 if (block_bitmap < first_block || block_bitmap > last_block) {
2656                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2657                                "Block bitmap for group %u not in group "
2658                                "(block %llu)!", i, block_bitmap);
2659                         return 0;
2660                 }
2661                 inode_bitmap = ext4_inode_bitmap(sb, gdp);
2662                 if (inode_bitmap == sb_block) {
2663                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2664                                  "Inode bitmap for group %u overlaps "
2665                                  "superblock", i);
2666                         if (!sb_rdonly(sb))
2667                                 return 0;
2668                 }
2669                 if (inode_bitmap >= sb_block + 1 &&
2670                     inode_bitmap <= last_bg_block) {
2671                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2672                                  "Inode bitmap for group %u overlaps "
2673                                  "block group descriptors", i);
2674                         if (!sb_rdonly(sb))
2675                                 return 0;
2676                 }
2677                 if (inode_bitmap < first_block || inode_bitmap > last_block) {
2678                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2679                                "Inode bitmap for group %u not in group "
2680                                "(block %llu)!", i, inode_bitmap);
2681                         return 0;
2682                 }
2683                 inode_table = ext4_inode_table(sb, gdp);
2684                 if (inode_table == sb_block) {
2685                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2686                                  "Inode table for group %u overlaps "
2687                                  "superblock", i);
2688                         if (!sb_rdonly(sb))
2689                                 return 0;
2690                 }
2691                 if (inode_table >= sb_block + 1 &&
2692                     inode_table <= last_bg_block) {
2693                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2694                                  "Inode table for group %u overlaps "
2695                                  "block group descriptors", i);
2696                         if (!sb_rdonly(sb))
2697                                 return 0;
2698                 }
2699                 if (inode_table < first_block ||
2700                     inode_table + sbi->s_itb_per_group - 1 > last_block) {
2701                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2702                                "Inode table for group %u not in group "
2703                                "(block %llu)!", i, inode_table);
2704                         return 0;
2705                 }
2706                 ext4_lock_group(sb, i);
2707                 if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
2708                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
2709                                  "Checksum for group %u failed (%u!=%u)",
2710                                  i, le16_to_cpu(ext4_group_desc_csum(sb, i,
2711                                      gdp)), le16_to_cpu(gdp->bg_checksum));
2712                         if (!sb_rdonly(sb)) {
2713                                 ext4_unlock_group(sb, i);
2714                                 return 0;
2715                         }
2716                 }
2717                 ext4_unlock_group(sb, i);
2718                 if (!flexbg_flag)
2719                         first_block += EXT4_BLOCKS_PER_GROUP(sb);
2720         }
2721         if (NULL != first_not_zeroed)
2722                 *first_not_zeroed = grp;
2723         return 1;
2724 }
2725
2726 /* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at
2727  * the superblock) which were deleted from all directories, but held open by
2728  * a process at the time of a crash.  We walk the list and try to delete these
2729  * inodes at recovery time (only with a read-write filesystem).
2730  *
2731  * In order to keep the orphan inode chain consistent during traversal (in
2732  * case of crash during recovery), we link each inode into the superblock
2733  * orphan list_head and handle it the same way as an inode deletion during
2734  * normal operation (which journals the operations for us).
2735  *
2736  * We only do an iget() and an iput() on each inode, which is very safe if we
2737  * accidentally point at an in-use or already deleted inode.  The worst that
2738  * can happen in this case is that we get a "bit already cleared" message from
2739  * ext4_free_inode().  The only reason we would point at a wrong inode is if
2740  * e2fsck was run on this filesystem, and it must have already done the orphan
2741  * inode cleanup for us, so we can safely abort without any further action.
2742  */
2743 static void ext4_orphan_cleanup(struct super_block *sb,
2744                                 struct ext4_super_block *es)
2745 {
2746         unsigned int s_flags = sb->s_flags;
2747         int ret, nr_orphans = 0, nr_truncates = 0;
2748 #ifdef CONFIG_QUOTA
2749         int quota_update = 0;
2750         int i;
2751 #endif
2752         if (!es->s_last_orphan) {
2753                 jbd_debug(4, "no orphan inodes to clean up\n");
2754                 return;
2755         }
2756
2757         if (bdev_read_only(sb->s_bdev)) {
2758                 ext4_msg(sb, KERN_ERR, "write access "
2759                         "unavailable, skipping orphan cleanup");
2760                 return;
2761         }
2762
2763         /* Check if feature set would not allow a r/w mount */
2764         if (!ext4_feature_set_ok(sb, 0)) {
2765                 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2766                          "unknown ROCOMPAT features");
2767                 return;
2768         }
2769
2770         if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2771                 /* don't clear list on RO mount w/ errors */
2772                 if (es->s_last_orphan && !(s_flags & SB_RDONLY)) {
2773                         ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
2774                                   "clearing orphan list.\n");
2775                         es->s_last_orphan = 0;
2776                 }
2777                 jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2778                 return;
2779         }
2780
2781         if (s_flags & SB_RDONLY) {
2782                 ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
2783                 sb->s_flags &= ~SB_RDONLY;
2784         }
2785 #ifdef CONFIG_QUOTA
2786         /* Needed for iput() to work correctly and not trash data */
2787         sb->s_flags |= SB_ACTIVE;
2788
2789         /*
2790          * Turn on quotas which were not enabled for read-only mounts if
2791          * filesystem has quota feature, so that they are updated correctly.
2792          */
2793         if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) {
2794                 int ret = ext4_enable_quotas(sb);
2795
2796                 if (!ret)
2797                         quota_update = 1;
2798                 else
2799                         ext4_msg(sb, KERN_ERR,
2800                                 "Cannot turn on quotas: error %d", ret);
2801         }
2802
2803         /* Turn on journaled quotas used for old sytle */
2804         for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2805                 if (EXT4_SB(sb)->s_qf_names[i]) {
2806                         int ret = ext4_quota_on_mount(sb, i);
2807
2808                         if (!ret)
2809                                 quota_update = 1;
2810                         else
2811                                 ext4_msg(sb, KERN_ERR,
2812                                         "Cannot turn on journaled "
2813                                         "quota: type %d: error %d", i, ret);
2814                 }
2815         }
2816 #endif
2817
2818         while (es->s_last_orphan) {
2819                 struct inode *inode;
2820
2821                 /*
2822                  * We may have encountered an error during cleanup; if
2823                  * so, skip the rest.
2824                  */
2825                 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2826                         jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
2827                         es->s_last_orphan = 0;
2828                         break;
2829                 }
2830
2831                 inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan));
2832                 if (IS_ERR(inode)) {
2833                         es->s_last_orphan = 0;
2834                         break;
2835                 }
2836
2837                 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2838                 dquot_initialize(inode);
2839                 if (inode->i_nlink) {
2840                         if (test_opt(sb, DEBUG))
2841                                 ext4_msg(sb, KERN_DEBUG,
2842                                         "%s: truncating inode %lu to %lld bytes",
2843                                         __func__, inode->i_ino, inode->i_size);
2844                         jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2845                                   inode->i_ino, inode->i_size);
2846                         inode_lock(inode);
2847                         truncate_inode_pages(inode->i_mapping, inode->i_size);
2848                         ret = ext4_truncate(inode);
2849                         if (ret)
2850                                 ext4_std_error(inode->i_sb, ret);
2851                         inode_unlock(inode);
2852                         nr_truncates++;
2853                 } else {
2854                         if (test_opt(sb, DEBUG))
2855                                 ext4_msg(sb, KERN_DEBUG,
2856                                         "%s: deleting unreferenced inode %lu",
2857                                         __func__, inode->i_ino);
2858                         jbd_debug(2, "deleting unreferenced inode %lu\n",
2859                                   inode->i_ino);
2860                         nr_orphans++;
2861                 }
2862                 iput(inode);  /* The delete magic happens here! */
2863         }
2864
2865 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
2866
2867         if (nr_orphans)
2868                 ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
2869                        PLURAL(nr_orphans));
2870         if (nr_truncates)
2871                 ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
2872                        PLURAL(nr_truncates));
2873 #ifdef CONFIG_QUOTA
2874         /* Turn off quotas if they were enabled for orphan cleanup */
2875         if (quota_update) {
2876                 for (i = 0; i < EXT4_MAXQUOTAS; i++) {
2877                         if (sb_dqopt(sb)->files[i])
2878                                 dquot_quota_off(sb, i);
2879                 }
2880         }
2881 #endif
2882         sb->s_flags = s_flags; /* Restore SB_RDONLY status */
2883 }
2884
2885 /*
2886  * Maximal extent format file size.
2887  * Resulting logical blkno at s_maxbytes must fit in our on-disk
2888  * extent format containers, within a sector_t, and within i_blocks
2889  * in the vfs.  ext4 inode has 48 bits of i_block in fsblock units,
2890  * so that won't be a limiting factor.
2891  *
2892  * However there is other limiting factor. We do store extents in the form
2893  * of starting block and length, hence the resulting length of the extent
2894  * covering maximum file size must fit into on-disk format containers as
2895  * well. Given that length is always by 1 unit bigger than max unit (because
2896  * we count 0 as well) we have to lower the s_maxbytes by one fs block.
2897  *
2898  * Note, this does *not* consider any metadata overhead for vfs i_blocks.
2899  */
2900 static loff_t ext4_max_size(int blkbits, int has_huge_files)
2901 {
2902         loff_t res;
2903         loff_t upper_limit = MAX_LFS_FILESIZE;
2904
2905         BUILD_BUG_ON(sizeof(blkcnt_t) < sizeof(u64));
2906
2907         if (!has_huge_files) {
2908                 upper_limit = (1LL << 32) - 1;
2909
2910                 /* total blocks in file system block size */
2911                 upper_limit >>= (blkbits - 9);
2912                 upper_limit <<= blkbits;
2913         }
2914
2915         /*
2916          * 32-bit extent-start container, ee_block. We lower the maxbytes
2917          * by one fs block, so ee_len can cover the extent of maximum file
2918          * size
2919          */
2920         res = (1LL << 32) - 1;
2921         res <<= blkbits;
2922
2923         /* Sanity check against vm- & vfs- imposed limits */
2924         if (res > upper_limit)
2925                 res = upper_limit;
2926
2927         return res;
2928 }
2929
2930 /*
2931  * Maximal bitmap file size.  There is a direct, and {,double-,triple-}indirect
2932  * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
2933  * We need to be 1 filesystem block less than the 2^48 sector limit.
2934  */
2935 static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
2936 {
2937         loff_t res = EXT4_NDIR_BLOCKS;
2938         int meta_blocks;
2939         loff_t upper_limit;
2940         /* This is calculated to be the largest file size for a dense, block
2941          * mapped file such that the file's total number of 512-byte sectors,
2942          * including data and all indirect blocks, does not exceed (2^48 - 1).
2943          *
2944          * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
2945          * number of 512-byte sectors of the file.
2946          */
2947
2948         if (!has_huge_files) {
2949                 /*
2950                  * !has_huge_files or implies that the inode i_block field
2951                  * represents total file blocks in 2^32 512-byte sectors ==
2952                  * size of vfs inode i_blocks * 8
2953                  */
2954                 upper_limit = (1LL << 32) - 1;
2955
2956                 /* total blocks in file system block size */
2957                 upper_limit >>= (bits - 9);
2958
2959         } else {
2960                 /*
2961                  * We use 48 bit ext4_inode i_blocks
2962                  * With EXT4_HUGE_FILE_FL set the i_blocks
2963                  * represent total number of blocks in
2964                  * file system block size
2965                  */
2966                 upper_limit = (1LL << 48) - 1;
2967
2968         }
2969
2970         /* indirect blocks */
2971         meta_blocks = 1;
2972         /* double indirect blocks */
2973         meta_blocks += 1 + (1LL << (bits-2));
2974         /* tripple indirect blocks */
2975         meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
2976
2977         upper_limit -= meta_blocks;
2978         upper_limit <<= bits;
2979
2980         res += 1LL << (bits-2);
2981         res += 1LL << (2*(bits-2));
2982         res += 1LL << (3*(bits-2));
2983         res <<= bits;
2984         if (res > upper_limit)
2985                 res = upper_limit;
2986
2987         if (res > MAX_LFS_FILESIZE)
2988                 res = MAX_LFS_FILESIZE;
2989
2990         return res;
2991 }
2992
2993 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
2994                                    ext4_fsblk_t logical_sb_block, int nr)
2995 {
2996         struct ext4_sb_info *sbi = EXT4_SB(sb);
2997         ext4_group_t bg, first_meta_bg;
2998         int has_super = 0;
2999
3000         first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
3001
3002         if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
3003                 return logical_sb_block + nr + 1;
3004         bg = sbi->s_desc_per_block * nr;
3005         if (ext4_bg_has_super(sb, bg))
3006                 has_super = 1;
3007
3008         /*
3009          * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
3010          * block 2, not 1.  If s_first_data_block == 0 (bigalloc is enabled
3011          * on modern mke2fs or blksize > 1k on older mke2fs) then we must
3012          * compensate.
3013          */
3014         if (sb->s_blocksize == 1024 && nr == 0 &&
3015             le32_to_cpu(sbi->s_es->s_first_data_block) == 0)
3016                 has_super++;
3017
3018         return (has_super + ext4_group_first_block_no(sb, bg));
3019 }
3020
3021 /**
3022  * ext4_get_stripe_size: Get the stripe size.
3023  * @sbi: In memory super block info
3024  *
3025  * If we have specified it via mount option, then
3026  * use the mount option value. If the value specified at mount time is
3027  * greater than the blocks per group use the super block value.
3028  * If the super block value is greater than blocks per group return 0.
3029  * Allocator needs it be less than blocks per group.
3030  *
3031  */
3032 static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
3033 {
3034         unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
3035         unsigned long stripe_width =
3036                         le32_to_cpu(sbi->s_es->s_raid_stripe_width);
3037         int ret;
3038
3039         if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
3040                 ret = sbi->s_stripe;
3041         else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
3042                 ret = stripe_width;
3043         else if (stride && stride <= sbi->s_blocks_per_group)
3044                 ret = stride;
3045         else
3046                 ret = 0;
3047
3048         /*
3049          * If the stripe width is 1, this makes no sense and
3050          * we set it to 0 to turn off stripe handling code.
3051          */
3052         if (ret <= 1)
3053                 ret = 0;
3054
3055         return ret;
3056 }
3057
3058 /*
3059  * Check whether this filesystem can be mounted based on
3060  * the features present and the RDONLY/RDWR mount requested.
3061  * Returns 1 if this filesystem can be mounted as requested,
3062  * 0 if it cannot be.
3063  */
3064 static int ext4_feature_set_ok(struct super_block *sb, int readonly)
3065 {
3066         if (ext4_has_unknown_ext4_incompat_features(sb)) {
3067                 ext4_msg(sb, KERN_ERR,
3068                         "Couldn't mount because of "
3069                         "unsupported optional features (%x)",
3070                         (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
3071                         ~EXT4_FEATURE_INCOMPAT_SUPP));
3072                 return 0;
3073         }
3074
3075 #ifndef CONFIG_UNICODE
3076         if (ext4_has_feature_casefold(sb)) {
3077                 ext4_msg(sb, KERN_ERR,
3078                          "Filesystem with casefold feature cannot be "
3079                          "mounted without CONFIG_UNICODE");
3080                 return 0;
3081         }
3082 #endif
3083
3084         if (readonly)
3085                 return 1;
3086
3087         if (ext4_has_feature_readonly(sb)) {
3088                 ext4_msg(sb, KERN_INFO, "filesystem is read-only");
3089                 sb->s_flags |= SB_RDONLY;
3090                 return 1;
3091         }
3092
3093         /* Check that feature set is OK for a read-write mount */
3094         if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
3095                 ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
3096                          "unsupported optional features (%x)",
3097                          (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
3098                                 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3099                 return 0;
3100         }
3101         if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
3102                 ext4_msg(sb, KERN_ERR,
3103                          "Can't support bigalloc feature without "
3104                          "extents feature\n");
3105                 return 0;
3106         }
3107
3108 #if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
3109         if (!readonly && (ext4_has_feature_quota(sb) ||
3110                           ext4_has_feature_project(sb))) {
3111                 ext4_msg(sb, KERN_ERR,
3112                          "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
3113                 return 0;
3114         }
3115 #endif  /* CONFIG_QUOTA */
3116         return 1;
3117 }
3118
3119 /*
3120  * This function is called once a day if we have errors logged
3121  * on the file system
3122  */
3123 static void print_daily_error_info(struct timer_list *t)
3124 {
3125         struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
3126         struct super_block *sb = sbi->s_sb;
3127         struct ext4_super_block *es = sbi->s_es;
3128
3129         if (es->s_error_count)
3130                 /* fsck newer than v1.41.13 is needed to clean this condition. */
3131                 ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
3132                          le32_to_cpu(es->s_error_count));
3133         if (es->s_first_error_time) {
3134                 printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
3135                        sb->s_id,
3136                        ext4_get_tstamp(es, s_first_error_time),
3137                        (int) sizeof(es->s_first_error_func),
3138                        es->s_first_error_func,
3139                        le32_to_cpu(es->s_first_error_line));
3140                 if (es->s_first_error_ino)
3141                         printk(KERN_CONT ": inode %u",
3142                                le32_to_cpu(es->s_first_error_ino));
3143                 if (es->s_first_error_block)
3144                         printk(KERN_CONT ": block %llu", (unsigned long long)
3145                                le64_to_cpu(es->s_first_error_block));
3146                 printk(KERN_CONT "\n");
3147         }
3148         if (es->s_last_error_time) {
3149                 printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
3150                        sb->s_id,
3151                        ext4_get_tstamp(es, s_last_error_time),
3152                        (int) sizeof(es->s_last_error_func),
3153                        es->s_last_error_func,
3154                        le32_to_cpu(es->s_last_error_line));
3155                 if (es->s_last_error_ino)
3156                         printk(KERN_CONT ": inode %u",
3157                                le32_to_cpu(es->s_last_error_ino));
3158                 if (es->s_last_error_block)
3159                         printk(KERN_CONT ": block %llu", (unsigned long long)
3160                                le64_to_cpu(es->s_last_error_block));
3161                 printk(KERN_CONT "\n");
3162         }
3163         mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ);  /* Once a day */
3164 }
3165
3166 /* Find next suitable group and run ext4_init_inode_table */
3167 static int ext4_run_li_request(struct ext4_li_request *elr)
3168 {
3169         struct ext4_group_desc *gdp = NULL;
3170         ext4_group_t group, ngroups;
3171         struct super_block *sb;
3172         unsigned long timeout = 0;
3173         int ret = 0;
3174
3175         sb = elr->lr_super;
3176         ngroups = EXT4_SB(sb)->s_groups_count;
3177
3178         for (group = elr->lr_next_group; group < ngroups; group++) {
3179                 gdp = ext4_get_group_desc(sb, group, NULL);
3180                 if (!gdp) {
3181                         ret = 1;
3182                         break;
3183                 }
3184
3185                 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3186                         break;
3187         }
3188
3189         if (group >= ngroups)
3190                 ret = 1;
3191
3192         if (!ret) {
3193                 timeout = jiffies;
3194                 ret = ext4_init_inode_table(sb, group,
3195                                             elr->lr_timeout ? 0 : 1);
3196                 if (elr->lr_timeout == 0) {
3197                         timeout = (jiffies - timeout) *
3198                                   elr->lr_sbi->s_li_wait_mult;
3199                         elr->lr_timeout = timeout;
3200                 }
3201                 elr->lr_next_sched = jiffies + elr->lr_timeout;
3202                 elr->lr_next_group = group + 1;
3203         }
3204         return ret;
3205 }
3206
3207 /*
3208  * Remove lr_request from the list_request and free the
3209  * request structure. Should be called with li_list_mtx held
3210  */
3211 static void ext4_remove_li_request(struct ext4_li_request *elr)
3212 {
3213         struct ext4_sb_info *sbi;
3214
3215         if (!elr)
3216                 return;
3217
3218         sbi = elr->lr_sbi;
3219
3220         list_del(&elr->lr_request);
3221         sbi->s_li_request = NULL;
3222         kfree(elr);
3223 }
3224
3225 static void ext4_unregister_li_request(struct super_block *sb)
3226 {
3227         mutex_lock(&ext4_li_mtx);
3228         if (!ext4_li_info) {
3229                 mutex_unlock(&ext4_li_mtx);
3230                 return;
3231         }
3232
3233         mutex_lock(&ext4_li_info->li_list_mtx);
3234         ext4_remove_li_request(EXT4_SB(sb)->s_li_request);
3235         mutex_unlock(&ext4_li_info->li_list_mtx);
3236         mutex_unlock(&ext4_li_mtx);
3237 }
3238
3239 static struct task_struct *ext4_lazyinit_task;
3240
3241 /*
3242  * This is the function where ext4lazyinit thread lives. It walks
3243  * through the request list searching for next scheduled filesystem.
3244  * When such a fs is found, run the lazy initialization request
3245  * (ext4_rn_li_request) and keep track of the time spend in this
3246  * function. Based on that time we compute next schedule time of
3247  * the request. When walking through the list is complete, compute
3248  * next waking time and put itself into sleep.
3249  */
3250 static int ext4_lazyinit_thread(void *arg)
3251 {
3252         struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
3253         struct list_head *pos, *n;
3254         struct ext4_li_request *elr;
3255         unsigned long next_wakeup, cur;
3256
3257         BUG_ON(NULL == eli);
3258
3259 cont_thread:
3260         while (true) {
3261                 next_wakeup = MAX_JIFFY_OFFSET;
3262
3263                 mutex_lock(&eli->li_list_mtx);
3264                 if (list_empty(&eli->li_request_list)) {
3265                         mutex_unlock(&eli->li_list_mtx);
3266                         goto exit_thread;
3267                 }
3268                 list_for_each_safe(pos, n, &eli->li_request_list) {
3269                         int err = 0;
3270                         int progress = 0;
3271                         elr = list_entry(pos, struct ext4_li_request,
3272                                          lr_request);
3273
3274                         if (time_before(jiffies, elr->lr_next_sched)) {
3275                                 if (time_before(elr->lr_next_sched, next_wakeup))
3276                                         next_wakeup = elr->lr_next_sched;
3277                                 continue;
3278                         }
3279                         if (down_read_trylock(&elr->lr_super->s_umount)) {
3280                                 if (sb_start_write_trylock(elr->lr_super)) {
3281                                         progress = 1;
3282                                         /*
3283                                          * We hold sb->s_umount, sb can not
3284                                          * be removed from the list, it is
3285                                          * now safe to drop li_list_mtx
3286                                          */
3287                                         mutex_unlock(&eli->li_list_mtx);
3288                                         err = ext4_run_li_request(elr);
3289                                         sb_end_write(elr->lr_super);
3290                                         mutex_lock(&eli->li_list_mtx);
3291                                         n = pos->next;
3292                                 }
3293                                 up_read((&elr->lr_super->s_umount));
3294                         }
3295                         /* error, remove the lazy_init job */
3296                         if (err) {
3297                                 ext4_remove_li_request(elr);
3298                                 continue;
3299                         }
3300                         if (!progress) {
3301                                 elr->lr_next_sched = jiffies +
3302                                         (prandom_u32()
3303                                          % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
3304                         }
3305                         if (time_before(elr->lr_next_sched, next_wakeup))
3306                                 next_wakeup = elr->lr_next_sched;
3307                 }
3308                 mutex_unlock(&eli->li_list_mtx);
3309
3310                 try_to_freeze();
3311
3312                 cur = jiffies;
3313                 if ((time_after_eq(cur, next_wakeup)) ||
3314                     (MAX_JIFFY_OFFSET == next_wakeup)) {
3315                         cond_resched();
3316                         continue;
3317                 }
3318
3319                 schedule_timeout_interruptible(next_wakeup - cur);
3320
3321                 if (kthread_should_stop()) {
3322                         ext4_clear_request_list();
3323                         goto exit_thread;
3324                 }
3325         }
3326
3327 exit_thread:
3328         /*
3329          * It looks like the request list is empty, but we need
3330          * to check it under the li_list_mtx lock, to prevent any
3331          * additions into it, and of course we should lock ext4_li_mtx
3332          * to atomically free the list and ext4_li_info, because at
3333          * this point another ext4 filesystem could be registering
3334          * new one.
3335          */
3336         mutex_lock(&ext4_li_mtx);
3337         mutex_lock(&eli->li_list_mtx);
3338         if (!list_empty(&eli->li_request_list)) {
3339                 mutex_unlock(&eli->li_list_mtx);
3340                 mutex_unlock(&ext4_li_mtx);
3341                 goto cont_thread;
3342         }
3343         mutex_unlock(&eli->li_list_mtx);
3344         kfree(ext4_li_info);
3345         ext4_li_info = NULL;
3346         mutex_unlock(&ext4_li_mtx);
3347
3348         return 0;
3349 }
3350
3351 static void ext4_clear_request_list(void)
3352 {
3353         struct list_head *pos, *n;
3354         struct ext4_li_request *elr;
3355
3356         mutex_lock(&ext4_li_info->li_list_mtx);
3357         list_for_each_safe(pos, n, &ext4_li_info->li_request_list) {
3358                 elr = list_entry(pos, struct ext4_li_request,
3359                                  lr_request);
3360                 ext4_remove_li_request(elr);
3361         }
3362         mutex_unlock(&ext4_li_info->li_list_mtx);
3363 }
3364
3365 static int ext4_run_lazyinit_thread(void)
3366 {
3367         ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
3368                                          ext4_li_info, "ext4lazyinit");
3369         if (IS_ERR(ext4_lazyinit_task)) {
3370                 int err = PTR_ERR(ext4_lazyinit_task);
3371                 ext4_clear_request_list();
3372                 kfree(ext4_li_info);
3373                 ext4_li_info = NULL;
3374                 printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
3375                                  "initialization thread\n",
3376                                  err);
3377                 return err;
3378         }
3379         ext4_li_info->li_state |= EXT4_LAZYINIT_RUNNING;
3380         return 0;
3381 }
3382
3383 /*
3384  * Check whether it make sense to run itable init. thread or not.
3385  * If there is at least one uninitialized inode table, return
3386  * corresponding group number, else the loop goes through all
3387  * groups and return total number of groups.
3388  */
3389 static ext4_group_t ext4_has_uninit_itable(struct super_block *sb)
3390 {
3391         ext4_group_t group, ngroups = EXT4_SB(sb)->s_groups_count;
3392         struct ext4_group_desc *gdp = NULL;
3393
3394         if (!ext4_has_group_desc_csum(sb))
3395                 return ngroups;
3396
3397         for (group = 0; group < ngroups; group++) {
3398                 gdp = ext4_get_group_desc(sb, group, NULL);
3399                 if (!gdp)
3400                         continue;
3401
3402                 if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_ZEROED)))
3403                         break;
3404         }
3405
3406         return group;
3407 }
3408
3409 static int ext4_li_info_new(void)
3410 {
3411         struct ext4_lazy_init *eli = NULL;
3412
3413         eli = kzalloc(sizeof(*eli), GFP_KERNEL);
3414         if (!eli)
3415                 return -ENOMEM;
3416
3417         INIT_LIST_HEAD(&eli->li_request_list);
3418         mutex_init(&eli->li_list_mtx);
3419
3420         eli->li_state |= EXT4_LAZYINIT_QUIT;
3421
3422         ext4_li_info = eli;
3423
3424         return 0;
3425 }
3426
3427 static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
3428                                             ext4_group_t start)
3429 {
3430         struct ext4_sb_info *sbi = EXT4_SB(sb);
3431         struct ext4_li_request *elr;
3432
3433         elr = kzalloc(sizeof(*elr), GFP_KERNEL);
3434         if (!elr)
3435                 return NULL;
3436
3437         elr->lr_super = sb;
3438         elr->lr_sbi = sbi;
3439         elr->lr_next_group = start;
3440
3441         /*
3442          * Randomize first schedule time of the request to
3443          * spread the inode table initialization requests
3444          * better.
3445          */
3446         elr->lr_next_sched = jiffies + (prandom_u32() %
3447                                 (EXT4_DEF_LI_MAX_START_DELAY * HZ));
3448         return elr;
3449 }
3450
3451 int ext4_register_li_request(struct super_block *sb,
3452                              ext4_group_t first_not_zeroed)
3453 {
3454         struct ext4_sb_info *sbi = EXT4_SB(sb);
3455         struct ext4_li_request *elr = NULL;
3456         ext4_group_t ngroups = sbi->s_groups_count;
3457         int ret = 0;
3458
3459         mutex_lock(&ext4_li_mtx);
3460         if (sbi->s_li_request != NULL) {
3461                 /*
3462                  * Reset timeout so it can be computed again, because
3463                  * s_li_wait_mult might have changed.
3464                  */
3465                 sbi->s_li_request->lr_timeout = 0;
3466                 goto out;
3467         }
3468
3469         if (first_not_zeroed == ngroups || sb_rdonly(sb) ||
3470             !test_opt(sb, INIT_INODE_TABLE))
3471                 goto out;
3472
3473         elr = ext4_li_request_new(sb, first_not_zeroed);
3474         if (!elr) {
3475                 ret = -ENOMEM;
3476                 goto out;
3477         }
3478
3479         if (NULL == ext4_li_info) {
3480                 ret = ext4_li_info_new();
3481                 if (ret)
3482                         goto out;
3483         }
3484
3485         mutex_lock(&ext4_li_info->li_list_mtx);
3486         list_add(&elr->lr_request, &ext4_li_info->li_request_list);
3487         mutex_unlock(&ext4_li_info->li_list_mtx);
3488
3489         sbi->s_li_request = elr;
3490         /*
3491          * set elr to NULL here since it has been inserted to
3492          * the request_list and the removal and free of it is
3493          * handled by ext4_clear_request_list from now on.
3494          */
3495         elr = NULL;
3496
3497         if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
3498                 ret = ext4_run_lazyinit_thread();
3499                 if (ret)
3500                         goto out;
3501         }
3502 out:
3503         mutex_unlock(&ext4_li_mtx);
3504         if (ret)
3505                 kfree(elr);
3506         return ret;
3507 }
3508
3509 /*
3510  * We do not need to lock anything since this is called on
3511  * module unload.
3512  */
3513 static void ext4_destroy_lazyinit_thread(void)
3514 {
3515         /*
3516          * If thread exited earlier
3517          * there's nothing to be done.
3518          */
3519         if (!ext4_li_info || !ext4_lazyinit_task)
3520                 return;
3521
3522         kthread_stop(ext4_lazyinit_task);
3523 }
3524
3525 static int set_journal_csum_feature_set(struct super_block *sb)
3526 {
3527         int ret = 1;
3528         int compat, incompat;
3529         struct ext4_sb_info *sbi = EXT4_SB(sb);
3530
3531         if (ext4_has_metadata_csum(sb)) {
3532                 /* journal checksum v3 */
3533                 compat = 0;
3534                 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3535         } else {
3536                 /* journal checksum v1 */
3537                 compat = JBD2_FEATURE_COMPAT_CHECKSUM;
3538                 incompat = 0;
3539         }
3540
3541         jbd2_journal_clear_features(sbi->s_journal,
3542                         JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3543                         JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3544                         JBD2_FEATURE_INCOMPAT_CSUM_V2);
3545         if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
3546                 ret = jbd2_journal_set_features(sbi->s_journal,
3547                                 compat, 0,
3548                                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3549                                 incompat);
3550         } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
3551                 ret = jbd2_journal_set_features(sbi->s_journal,
3552                                 compat, 0,
3553                                 incompat);
3554                 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3555                                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3556         } else {
3557                 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
3558                                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
3559         }
3560
3561         return ret;
3562 }
3563
3564 /*
3565  * Note: calculating the overhead so we can be compatible with
3566  * historical BSD practice is quite difficult in the face of
3567  * clusters/bigalloc.  This is because multiple metadata blocks from
3568  * different block group can end up in the same allocation cluster.
3569  * Calculating the exact overhead in the face of clustered allocation
3570  * requires either O(all block bitmaps) in memory or O(number of block
3571  * groups**2) in time.  We will still calculate the superblock for
3572  * older file systems --- and if we come across with a bigalloc file
3573  * system with zero in s_overhead_clusters the estimate will be close to
3574  * correct especially for very large cluster sizes --- but for newer
3575  * file systems, it's better to calculate this figure once at mkfs
3576  * time, and store it in the superblock.  If the superblock value is
3577  * present (even for non-bigalloc file systems), we will use it.
3578  */
3579 static int count_overhead(struct super_block *sb, ext4_group_t grp,
3580                           char *buf)
3581 {
3582         struct ext4_sb_info     *sbi = EXT4_SB(sb);
3583         struct ext4_group_desc  *gdp;
3584         ext4_fsblk_t            first_block, last_block, b;
3585         ext4_group_t            i, ngroups = ext4_get_groups_count(sb);
3586         int                     s, j, count = 0;
3587
3588         if (!ext4_has_feature_bigalloc(sb))
3589                 return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
3590                         sbi->s_itb_per_group + 2);
3591
3592         first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
3593                 (grp * EXT4_BLOCKS_PER_GROUP(sb));
3594         last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
3595         for (i = 0; i < ngroups; i++) {
3596                 gdp = ext4_get_group_desc(sb, i, NULL);
3597                 b = ext4_block_bitmap(sb, gdp);
3598                 if (b >= first_block && b <= last_block) {
3599                         ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3600                         count++;
3601                 }
3602                 b = ext4_inode_bitmap(sb, gdp);
3603                 if (b >= first_block && b <= last_block) {
3604                         ext4_set_bit(EXT4_B2C(sbi, b - first_block), buf);
3605                         count++;
3606                 }
3607                 b = ext4_inode_table(sb, gdp);
3608                 if (b >= first_block && b + sbi->s_itb_per_group <= last_block)
3609                         for (j = 0; j < sbi->s_itb_per_group; j++, b++) {
3610                                 int c = EXT4_B2C(sbi, b - first_block);
3611                                 ext4_set_bit(c, buf);
3612                                 count++;
3613                         }
3614                 if (i != grp)
3615                         continue;
3616                 s = 0;
3617                 if (ext4_bg_has_super(sb, grp)) {
3618                         ext4_set_bit(s++, buf);
3619                         count++;
3620                 }
3621                 j = ext4_bg_num_gdb(sb, grp);
3622                 if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) {
3623                         ext4_error(sb, "Invalid number of block group "
3624                                    "descriptor blocks: %d", j);
3625                         j = EXT4_BLOCKS_PER_GROUP(sb) - s;
3626                 }
3627                 count += j;
3628                 for (; j > 0; j--)
3629                         ext4_set_bit(EXT4_B2C(sbi, s++), buf);
3630         }
3631         if (!count)
3632                 return 0;
3633         return EXT4_CLUSTERS_PER_GROUP(sb) -
3634                 ext4_count_free(buf, EXT4_CLUSTERS_PER_GROUP(sb) / 8);
3635 }
3636
3637 /*
3638  * Compute the overhead and stash it in sbi->s_overhead
3639  */
3640 int ext4_calculate_overhead(struct super_block *sb)
3641 {
3642         struct ext4_sb_info *sbi = EXT4_SB(sb);
3643         struct ext4_super_block *es = sbi->s_es;
3644         struct inode *j_inode;
3645         unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
3646         ext4_group_t i, ngroups = ext4_get_groups_count(sb);
3647         ext4_fsblk_t overhead = 0;
3648         char *buf = (char *) get_zeroed_page(GFP_NOFS);
3649
3650         if (!buf)
3651                 return -ENOMEM;
3652
3653         /*
3654          * Compute the overhead (FS structures).  This is constant
3655          * for a given filesystem unless the number of block groups
3656          * changes so we cache the previous value until it does.
3657          */
3658
3659         /*
3660          * All of the blocks before first_data_block are overhead
3661          */
3662         overhead = EXT4_B2C(sbi, le32_to_cpu(es->s_first_data_block));
3663
3664         /*
3665          * Add the overhead found in each block group
3666          */
3667         for (i = 0; i < ngroups; i++) {
3668                 int blks;
3669
3670                 blks = count_overhead(sb, i, buf);
3671                 overhead += blks;
3672                 if (blks)
3673                         memset(buf, 0, PAGE_SIZE);
3674                 cond_resched();
3675         }
3676
3677         /*
3678          * Add the internal journal blocks whether the journal has been
3679          * loaded or not
3680          */
3681         if (sbi->s_journal && !sbi->journal_bdev)
3682                 overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
3683         else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
3684                 /* j_inum for internal journal is non-zero */
3685                 j_inode = ext4_get_journal_inode(sb, j_inum);
3686                 if (j_inode) {
3687                         j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
3688                         overhead += EXT4_NUM_B2C(sbi, j_blocks);
3689                         iput(j_inode);
3690                 } else {
3691                         ext4_msg(sb, KERN_ERR, "can't get journal size");
3692                 }
3693         }
3694         sbi->s_overhead = overhead;
3695         smp_wmb();
3696         free_page((unsigned long) buf);
3697         return 0;
3698 }
3699
3700 static void ext4_set_resv_clusters(struct super_block *sb)
3701 {
3702         ext4_fsblk_t resv_clusters;
3703         struct ext4_sb_info *sbi = EXT4_SB(sb);
3704
3705         /*
3706          * There's no need to reserve anything when we aren't using extents.
3707          * The space estimates are exact, there are no unwritten extents,
3708          * hole punching doesn't need new metadata... This is needed especially
3709          * to keep ext2/3 backward compatibility.
3710          */
3711         if (!ext4_has_feature_extents(sb))
3712                 return;
3713         /*
3714          * By default we reserve 2% or 4096 clusters, whichever is smaller.
3715          * This should cover the situations where we can not afford to run
3716          * out of space like for example punch hole, or converting
3717          * unwritten extents in delalloc path. In most cases such
3718          * allocation would require 1, or 2 blocks, higher numbers are
3719          * very rare.
3720          */
3721         resv_clusters = (ext4_blocks_count(sbi->s_es) >>
3722                          sbi->s_cluster_bits);
3723
3724         do_div(resv_clusters, 50);
3725         resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
3726
3727         atomic64_set(&sbi->s_resv_clusters, resv_clusters);
3728 }
3729
3730 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3731 {
3732         struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
3733         char *orig_data = kstrdup(data, GFP_KERNEL);
3734         struct buffer_head *bh, **group_desc;
3735         struct ext4_super_block *es = NULL;
3736         struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
3737         struct flex_groups **flex_groups;
3738         ext4_fsblk_t block;
3739         ext4_fsblk_t sb_block = get_sb_block(&data);
3740         ext4_fsblk_t logical_sb_block;
3741         unsigned long offset = 0;
3742         unsigned long journal_devnum = 0;
3743         unsigned long def_mount_opts;
3744         struct inode *root;
3745         const char *descr;
3746         int ret = -ENOMEM;
3747         int blocksize, clustersize;
3748         unsigned int db_count;
3749         unsigned int i;
3750         int needs_recovery, has_huge_files;
3751         __u64 blocks_count;
3752         int err = 0;
3753         unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3754         ext4_group_t first_not_zeroed;
3755
3756         if ((data && !orig_data) || !sbi)
3757                 goto out_free_base;
3758
3759         sbi->s_daxdev = dax_dev;
3760         sbi->s_blockgroup_lock =
3761                 kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
3762         if (!sbi->s_blockgroup_lock)
3763                 goto out_free_base;
3764
3765         sb->s_fs_info = sbi;
3766         sbi->s_sb = sb;
3767         sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
3768         sbi->s_sb_block = sb_block;
3769         if (sb->s_bdev->bd_part)
3770                 sbi->s_sectors_written_start =
3771                         part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]);
3772
3773         /* Cleanup superblock name */
3774         strreplace(sb->s_id, '/', '!');
3775
3776         /* -EINVAL is default */
3777         ret = -EINVAL;
3778         blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
3779         if (!blocksize) {
3780                 ext4_msg(sb, KERN_ERR, "unable to set blocksize");
3781                 goto out_fail;
3782         }
3783
3784         /*
3785          * The ext4 superblock will not be buffer aligned for other than 1kB
3786          * block sizes.  We need to calculate the offset from buffer start.
3787          */
3788         if (blocksize != EXT4_MIN_BLOCK_SIZE) {
3789                 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
3790                 offset = do_div(logical_sb_block, blocksize);
3791         } else {
3792                 logical_sb_block = sb_block;
3793         }
3794
3795         if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) {
3796                 ext4_msg(sb, KERN_ERR, "unable to read superblock");
3797                 goto out_fail;
3798         }
3799         /*
3800          * Note: s_es must be initialized as soon as possible because
3801          *       some ext4 macro-instructions depend on its value
3802          */
3803         es = (struct ext4_super_block *) (bh->b_data + offset);
3804         sbi->s_es = es;
3805         sb->s_magic = le16_to_cpu(es->s_magic);
3806         if (sb->s_magic != EXT4_SUPER_MAGIC)
3807                 goto cantfind_ext4;
3808         sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
3809
3810         /* Warn if metadata_csum and gdt_csum are both set. */
3811         if (ext4_has_feature_metadata_csum(sb) &&
3812             ext4_has_feature_gdt_csum(sb))
3813                 ext4_warning(sb, "metadata_csum and uninit_bg are "
3814                              "redundant flags; please run fsck.");
3815
3816         /* Check for a known checksum algorithm */
3817         if (!ext4_verify_csum_type(sb, es)) {
3818                 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3819                          "unknown checksum algorithm.");
3820                 silent = 1;
3821                 goto cantfind_ext4;
3822         }
3823
3824         /* Load the checksum driver */
3825         sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
3826         if (IS_ERR(sbi->s_chksum_driver)) {
3827                 ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
3828                 ret = PTR_ERR(sbi->s_chksum_driver);
3829                 sbi->s_chksum_driver = NULL;
3830                 goto failed_mount;
3831         }
3832
3833         /* Check superblock checksum */
3834         if (!ext4_superblock_csum_verify(sb, es)) {
3835                 ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
3836                          "invalid superblock checksum.  Run e2fsck?");
3837                 silent = 1;
3838                 ret = -EFSBADCRC;
3839                 goto cantfind_ext4;
3840         }
3841
3842         /* Precompute checksum seed for all metadata */
3843         if (ext4_has_feature_csum_seed(sb))
3844                 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
3845         else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
3846                 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
3847                                                sizeof(es->s_uuid));
3848
3849         /* Set defaults before we parse the mount options */
3850         def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
3851         set_opt(sb, INIT_INODE_TABLE);
3852         if (def_mount_opts & EXT4_DEFM_DEBUG)
3853                 set_opt(sb, DEBUG);
3854         if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
3855                 set_opt(sb, GRPID);
3856         if (def_mount_opts & EXT4_DEFM_UID16)
3857                 set_opt(sb, NO_UID32);
3858         /* xattr user namespace & acls are now defaulted on */
3859         set_opt(sb, XATTR_USER);
3860 #ifdef CONFIG_EXT4_FS_POSIX_ACL
3861         set_opt(sb, POSIX_ACL);
3862 #endif
3863         /* don't forget to enable journal_csum when metadata_csum is enabled. */
3864         if (ext4_has_metadata_csum(sb))
3865                 set_opt(sb, JOURNAL_CHECKSUM);
3866
3867         if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3868                 set_opt(sb, JOURNAL_DATA);
3869         else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
3870                 set_opt(sb, ORDERED_DATA);
3871         else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
3872                 set_opt(sb, WRITEBACK_DATA);
3873
3874         if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
3875                 set_opt(sb, ERRORS_PANIC);
3876         else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
3877                 set_opt(sb, ERRORS_CONT);
3878         else
3879                 set_opt(sb, ERRORS_RO);
3880         /* block_validity enabled by default; disable with noblock_validity */
3881         set_opt(sb, BLOCK_VALIDITY);
3882         if (def_mount_opts & EXT4_DEFM_DISCARD)
3883                 set_opt(sb, DISCARD);
3884
3885         sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
3886         sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
3887         sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
3888         sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
3889         sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
3890
3891         if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0)
3892                 set_opt(sb, BARRIER);
3893
3894         /*
3895          * enable delayed allocation by default
3896          * Use -o nodelalloc to turn it off
3897          */
3898         if (!IS_EXT3_SB(sb) && !IS_EXT2_SB(sb) &&
3899             ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
3900                 set_opt(sb, DELALLOC);
3901
3902         /*
3903          * set default s_li_wait_mult for lazyinit, for the case there is
3904          * no mount option specified.
3905          */
3906         sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
3907
3908         blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
3909
3910         if (blocksize == PAGE_SIZE)
3911                 set_opt(sb, DIOREAD_NOLOCK);
3912
3913         if (blocksize < EXT4_MIN_BLOCK_SIZE ||
3914             blocksize > EXT4_MAX_BLOCK_SIZE) {
3915                 ext4_msg(sb, KERN_ERR,
3916                        "Unsupported filesystem blocksize %d (%d log_block_size)",
3917                          blocksize, le32_to_cpu(es->s_log_block_size));
3918                 goto failed_mount;
3919         }
3920
3921         if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
3922                 sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
3923                 sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
3924         } else {
3925                 sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
3926                 sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
3927                 if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
3928                         ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
3929                                  sbi->s_first_ino);
3930                         goto failed_mount;
3931                 }
3932                 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
3933                     (!is_power_of_2(sbi->s_inode_size)) ||
3934                     (sbi->s_inode_size > blocksize)) {
3935                         ext4_msg(sb, KERN_ERR,
3936                                "unsupported inode size: %d",
3937                                sbi->s_inode_size);
3938                         ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
3939                         goto failed_mount;
3940                 }
3941                 /*
3942                  * i_atime_extra is the last extra field available for
3943                  * [acm]times in struct ext4_inode. Checking for that
3944                  * field should suffice to ensure we have extra space
3945                  * for all three.
3946                  */
3947                 if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
3948                         sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
3949                         sb->s_time_gran = 1;
3950                         sb->s_time_max = EXT4_EXTRA_TIMESTAMP_MAX;
3951                 } else {
3952                         sb->s_time_gran = NSEC_PER_SEC;
3953                         sb->s_time_max = EXT4_NON_EXTRA_TIMESTAMP_MAX;
3954                 }
3955                 sb->s_time_min = EXT4_TIMESTAMP_MIN;
3956         }
3957         if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
3958                 sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
3959                         EXT4_GOOD_OLD_INODE_SIZE;
3960                 if (ext4_has_feature_extra_isize(sb)) {
3961                         unsigned v, max = (sbi->s_inode_size -
3962                                            EXT4_GOOD_OLD_INODE_SIZE);
3963
3964                         v = le16_to_cpu(es->s_want_extra_isize);
3965                         if (v > max) {
3966                                 ext4_msg(sb, KERN_ERR,
3967                                          "bad s_want_extra_isize: %d", v);
3968                                 goto failed_mount;
3969                         }
3970                         if (sbi->s_want_extra_isize < v)
3971                                 sbi->s_want_extra_isize = v;
3972
3973                         v = le16_to_cpu(es->s_min_extra_isize);
3974                         if (v > max) {
3975                                 ext4_msg(sb, KERN_ERR,
3976                                          "bad s_min_extra_isize: %d", v);
3977                                 goto failed_mount;
3978                         }
3979                         if (sbi->s_want_extra_isize < v)
3980                                 sbi->s_want_extra_isize = v;
3981                 }
3982         }
3983
3984         if (sbi->s_es->s_mount_opts[0]) {
3985                 char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
3986                                               sizeof(sbi->s_es->s_mount_opts),
3987                                               GFP_KERNEL);
3988                 if (!s_mount_opts)
3989                         goto failed_mount;
3990                 if (!parse_options(s_mount_opts, sb, &journal_devnum,
3991                                    &journal_ioprio, 0)) {
3992                         ext4_msg(sb, KERN_WARNING,
3993                                  "failed to parse options in superblock: %s",
3994                                  s_mount_opts);
3995                 }
3996                 kfree(s_mount_opts);
3997         }
3998         sbi->s_def_mount_opt = sbi->s_mount_opt;
3999         if (!parse_options((char *) data, sb, &journal_devnum,
4000                            &journal_ioprio, 0))
4001                 goto failed_mount;
4002
4003 #ifdef CONFIG_UNICODE
4004         if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) {
4005                 const struct ext4_sb_encodings *encoding_info;
4006                 struct unicode_map *encoding;
4007                 __u16 encoding_flags;
4008
4009                 if (ext4_has_feature_encrypt(sb)) {
4010                         ext4_msg(sb, KERN_ERR,
4011                                  "Can't mount with encoding and encryption");
4012                         goto failed_mount;
4013                 }
4014
4015                 if (ext4_sb_read_encoding(es, &encoding_info,
4016                                           &encoding_flags)) {
4017                         ext4_msg(sb, KERN_ERR,
4018                                  "Encoding requested by superblock is unknown");
4019                         goto failed_mount;
4020                 }
4021
4022                 encoding = utf8_load(encoding_info->version);
4023                 if (IS_ERR(encoding)) {
4024                         ext4_msg(sb, KERN_ERR,
4025                                  "can't mount with superblock charset: %s-%s "
4026                                  "not supported by the kernel. flags: 0x%x.",
4027                                  encoding_info->name, encoding_info->version,
4028                                  encoding_flags);
4029                         goto failed_mount;
4030                 }
4031                 ext4_msg(sb, KERN_INFO,"Using encoding defined by superblock: "
4032                          "%s-%s with flags 0x%hx", encoding_info->name,
4033                          encoding_info->version?:"\b", encoding_flags);
4034
4035                 sbi->s_encoding = encoding;
4036                 sbi->s_encoding_flags = encoding_flags;
4037         }
4038 #endif
4039
4040         if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4041                 printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n");
4042                 /* can't mount with both data=journal and dioread_nolock. */
4043                 clear_opt(sb, DIOREAD_NOLOCK);
4044                 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4045                         ext4_msg(sb, KERN_ERR, "can't mount with "
4046                                  "both data=journal and delalloc");
4047                         goto failed_mount;
4048                 }
4049                 if (test_opt(sb, DAX_ALWAYS)) {
4050                         ext4_msg(sb, KERN_ERR, "can't mount with "
4051                                  "both data=journal and dax");
4052                         goto failed_mount;
4053                 }
4054                 if (ext4_has_feature_encrypt(sb)) {
4055                         ext4_msg(sb, KERN_WARNING,
4056                                  "encrypted files will use data=ordered "
4057                                  "instead of data journaling mode");
4058                 }
4059                 if (test_opt(sb, DELALLOC))
4060                         clear_opt(sb, DELALLOC);
4061         } else {
4062                 sb->s_iflags |= SB_I_CGROUPWB;
4063         }
4064
4065         sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
4066                 (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
4067
4068         if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
4069             (ext4_has_compat_features(sb) ||
4070              ext4_has_ro_compat_features(sb) ||
4071              ext4_has_incompat_features(sb)))
4072                 ext4_msg(sb, KERN_WARNING,
4073                        "feature flags set on rev 0 fs, "
4074                        "running e2fsck is recommended");
4075
4076         if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
4077                 set_opt2(sb, HURD_COMPAT);
4078                 if (ext4_has_feature_64bit(sb)) {
4079                         ext4_msg(sb, KERN_ERR,
4080                                  "The Hurd can't support 64-bit file systems");
4081                         goto failed_mount;
4082                 }
4083
4084                 /*
4085                  * ea_inode feature uses l_i_version field which is not
4086                  * available in HURD_COMPAT mode.
4087                  */
4088                 if (ext4_has_feature_ea_inode(sb)) {
4089                         ext4_msg(sb, KERN_ERR,
4090                                  "ea_inode feature is not supported for Hurd");
4091                         goto failed_mount;
4092                 }
4093         }
4094
4095         if (IS_EXT2_SB(sb)) {
4096                 if (ext2_feature_set_ok(sb))
4097                         ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
4098                                  "using the ext4 subsystem");
4099                 else {
4100                         /*
4101                          * If we're probing be silent, if this looks like
4102                          * it's actually an ext[34] filesystem.
4103                          */
4104                         if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4105                                 goto failed_mount;
4106                         ext4_msg(sb, KERN_ERR, "couldn't mount as ext2 due "
4107                                  "to feature incompatibilities");
4108                         goto failed_mount;
4109                 }
4110         }
4111
4112         if (IS_EXT3_SB(sb)) {
4113                 if (ext3_feature_set_ok(sb))
4114                         ext4_msg(sb, KERN_INFO, "mounting ext3 file system "
4115                                  "using the ext4 subsystem");
4116                 else {
4117                         /*
4118                          * If we're probing be silent, if this looks like
4119                          * it's actually an ext4 filesystem.
4120                          */
4121                         if (silent && ext4_feature_set_ok(sb, sb_rdonly(sb)))
4122                                 goto failed_mount;
4123                         ext4_msg(sb, KERN_ERR, "couldn't mount as ext3 due "
4124                                  "to feature incompatibilities");
4125                         goto failed_mount;
4126                 }
4127         }
4128
4129         /*
4130          * Check feature flags regardless of the revision level, since we
4131          * previously didn't change the revision level when setting the flags,
4132          * so there is a chance incompat flags are set on a rev 0 filesystem.
4133          */
4134         if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
4135                 goto failed_mount;
4136
4137         if (le32_to_cpu(es->s_log_block_size) >
4138             (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
4139                 ext4_msg(sb, KERN_ERR,
4140                          "Invalid log block size: %u",
4141                          le32_to_cpu(es->s_log_block_size));
4142                 goto failed_mount;
4143         }
4144         if (le32_to_cpu(es->s_log_cluster_size) >
4145             (EXT4_MAX_CLUSTER_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
4146                 ext4_msg(sb, KERN_ERR,
4147                          "Invalid log cluster size: %u",
4148                          le32_to_cpu(es->s_log_cluster_size));
4149                 goto failed_mount;
4150         }
4151
4152         if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (blocksize / 4)) {
4153                 ext4_msg(sb, KERN_ERR,
4154                          "Number of reserved GDT blocks insanely large: %d",
4155                          le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks));
4156                 goto failed_mount;
4157         }
4158
4159         if (bdev_dax_supported(sb->s_bdev, blocksize))
4160                 set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags);
4161
4162         if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) {
4163                 if (ext4_has_feature_inline_data(sb)) {
4164                         ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem"
4165                                         " that may contain inline data");
4166                         goto failed_mount;
4167                 }
4168                 if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) {
4169                         ext4_msg(sb, KERN_ERR,
4170                                 "DAX unsupported by block device.");
4171                         goto failed_mount;
4172                 }
4173         }
4174
4175         if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
4176                 ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
4177                          es->s_encryption_level);
4178                 goto failed_mount;
4179         }
4180
4181         if (sb->s_blocksize != blocksize) {
4182                 /* Validate the filesystem blocksize */
4183                 if (!sb_set_blocksize(sb, blocksize)) {
4184                         ext4_msg(sb, KERN_ERR, "bad block size %d",
4185                                         blocksize);
4186                         goto failed_mount;
4187                 }
4188
4189                 brelse(bh);
4190                 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
4191                 offset = do_div(logical_sb_block, blocksize);
4192                 bh = sb_bread_unmovable(sb, logical_sb_block);
4193                 if (!bh) {
4194                         ext4_msg(sb, KERN_ERR,
4195                                "Can't read superblock on 2nd try");
4196                         goto failed_mount;
4197                 }
4198                 es = (struct ext4_super_block *)(bh->b_data + offset);
4199                 sbi->s_es = es;
4200                 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
4201                         ext4_msg(sb, KERN_ERR,
4202                                "Magic mismatch, very weird!");
4203                         goto failed_mount;
4204                 }
4205         }
4206
4207         has_huge_files = ext4_has_feature_huge_file(sb);
4208         sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
4209                                                       has_huge_files);
4210         sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
4211
4212         sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
4213         if (ext4_has_feature_64bit(sb)) {
4214                 if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
4215                     sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
4216                     !is_power_of_2(sbi->s_desc_size)) {
4217                         ext4_msg(sb, KERN_ERR,
4218                                "unsupported descriptor size %lu",
4219                                sbi->s_desc_size);
4220                         goto failed_mount;
4221                 }
4222         } else
4223                 sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
4224
4225         sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
4226         sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
4227
4228         sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
4229         if (sbi->s_inodes_per_block == 0)
4230                 goto cantfind_ext4;
4231         if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
4232             sbi->s_inodes_per_group > blocksize * 8) {
4233                 ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
4234                          sbi->s_inodes_per_group);
4235                 goto failed_mount;
4236         }
4237         sbi->s_itb_per_group = sbi->s_inodes_per_group /
4238                                         sbi->s_inodes_per_block;
4239         sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
4240         sbi->s_sbh = bh;
4241         sbi->s_mount_state = le16_to_cpu(es->s_state);
4242         sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
4243         sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
4244
4245         for (i = 0; i < 4; i++)
4246                 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
4247         sbi->s_def_hash_version = es->s_def_hash_version;
4248         if (ext4_has_feature_dir_index(sb)) {
4249                 i = le32_to_cpu(es->s_flags);
4250                 if (i & EXT2_FLAGS_UNSIGNED_HASH)
4251                         sbi->s_hash_unsigned = 3;
4252                 else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
4253 #ifdef __CHAR_UNSIGNED__
4254                         if (!sb_rdonly(sb))
4255                                 es->s_flags |=
4256                                         cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
4257                         sbi->s_hash_unsigned = 3;
4258 #else
4259                         if (!sb_rdonly(sb))
4260                                 es->s_flags |=
4261                                         cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
4262 #endif
4263                 }
4264         }
4265
4266         /* Handle clustersize */
4267         clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
4268         if (ext4_has_feature_bigalloc(sb)) {
4269                 if (clustersize < blocksize) {
4270                         ext4_msg(sb, KERN_ERR,
4271                                  "cluster size (%d) smaller than "
4272                                  "block size (%d)", clustersize, blocksize);
4273                         goto failed_mount;
4274                 }
4275                 sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
4276                         le32_to_cpu(es->s_log_block_size);
4277                 sbi->s_clusters_per_group =
4278                         le32_to_cpu(es->s_clusters_per_group);
4279                 if (sbi->s_clusters_per_group > blocksize * 8) {
4280                         ext4_msg(sb, KERN_ERR,
4281                                  "#clusters per group too big: %lu",
4282                                  sbi->s_clusters_per_group);
4283                         goto failed_mount;
4284                 }
4285                 if (sbi->s_blocks_per_group !=
4286                     (sbi->s_clusters_per_group * (clustersize / blocksize))) {
4287                         ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
4288                                  "clusters per group (%lu) inconsistent",
4289                                  sbi->s_blocks_per_group,
4290                                  sbi->s_clusters_per_group);
4291                         goto failed_mount;
4292                 }
4293         } else {
4294                 if (clustersize != blocksize) {
4295                         ext4_msg(sb, KERN_ERR,
4296                                  "fragment/cluster size (%d) != "
4297                                  "block size (%d)", clustersize, blocksize);
4298                         goto failed_mount;
4299                 }
4300                 if (sbi->s_blocks_per_group > blocksize * 8) {
4301                         ext4_msg(sb, KERN_ERR,
4302                                  "#blocks per group too big: %lu",
4303                                  sbi->s_blocks_per_group);
4304                         goto failed_mount;
4305                 }
4306                 sbi->s_clusters_per_group = sbi->s_blocks_per_group;
4307                 sbi->s_cluster_bits = 0;
4308         }
4309         sbi->s_cluster_ratio = clustersize / blocksize;
4310
4311         /* Do we have standard group size of clustersize * 8 blocks ? */
4312         if (sbi->s_blocks_per_group == clustersize << 3)
4313                 set_opt2(sb, STD_GROUP_SIZE);
4314
4315         /*
4316          * Test whether we have more sectors than will fit in sector_t,
4317          * and whether the max offset is addressable by the page cache.
4318          */
4319         err = generic_check_addressable(sb->s_blocksize_bits,
4320                                         ext4_blocks_count(es));
4321         if (err) {
4322                 ext4_msg(sb, KERN_ERR, "filesystem"
4323                          " too large to mount safely on this system");
4324                 goto failed_mount;
4325         }
4326
4327         if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
4328                 goto cantfind_ext4;
4329
4330         /* check blocks count against device size */
4331         blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
4332         if (blocks_count && ext4_blocks_count(es) > blocks_count) {
4333                 ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
4334                        "exceeds size of device (%llu blocks)",
4335                        ext4_blocks_count(es), blocks_count);
4336                 goto failed_mount;
4337         }
4338
4339         /*
4340          * It makes no sense for the first data block to be beyond the end
4341          * of the filesystem.
4342          */
4343         if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
4344                 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4345                          "block %u is beyond end of filesystem (%llu)",
4346                          le32_to_cpu(es->s_first_data_block),
4347                          ext4_blocks_count(es));
4348                 goto failed_mount;
4349         }
4350         if ((es->s_first_data_block == 0) && (es->s_log_block_size == 0) &&
4351             (sbi->s_cluster_ratio == 1)) {
4352                 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
4353                          "block is 0 with a 1k block and cluster size");
4354                 goto failed_mount;
4355         }
4356
4357         blocks_count = (ext4_blocks_count(es) -
4358                         le32_to_cpu(es->s_first_data_block) +
4359                         EXT4_BLOCKS_PER_GROUP(sb) - 1);
4360         do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
4361         if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
4362                 ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
4363                        "(block count %llu, first data block %u, "
4364                        "blocks per group %lu)", blocks_count,
4365                        ext4_blocks_count(es),
4366                        le32_to_cpu(es->s_first_data_block),
4367                        EXT4_BLOCKS_PER_GROUP(sb));
4368                 goto failed_mount;
4369         }
4370         sbi->s_groups_count = blocks_count;
4371         sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
4372                         (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
4373         if (((u64)sbi->s_groups_count * sbi->s_inodes_per_group) !=
4374             le32_to_cpu(es->s_inodes_count)) {
4375                 ext4_msg(sb, KERN_ERR, "inodes count not valid: %u vs %llu",
4376                          le32_to_cpu(es->s_inodes_count),
4377                          ((u64)sbi->s_groups_count * sbi->s_inodes_per_group));
4378                 ret = -EINVAL;
4379                 goto failed_mount;
4380         }
4381         db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
4382                    EXT4_DESC_PER_BLOCK(sb);
4383         if (ext4_has_feature_meta_bg(sb)) {
4384                 if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
4385                         ext4_msg(sb, KERN_WARNING,
4386                                  "first meta block group too large: %u "
4387                                  "(group descriptor block count %u)",
4388                                  le32_to_cpu(es->s_first_meta_bg), db_count);
4389                         goto failed_mount;
4390                 }
4391         }
4392         rcu_assign_pointer(sbi->s_group_desc,
4393                            kvmalloc_array(db_count,
4394                                           sizeof(struct buffer_head *),
4395                                           GFP_KERNEL));
4396         if (sbi->s_group_desc == NULL) {
4397                 ext4_msg(sb, KERN_ERR, "not enough memory");
4398                 ret = -ENOMEM;
4399                 goto failed_mount;
4400         }
4401
4402         bgl_lock_init(sbi->s_blockgroup_lock);
4403
4404         /* Pre-read the descriptors into the buffer cache */
4405         for (i = 0; i < db_count; i++) {
4406                 block = descriptor_loc(sb, logical_sb_block, i);
4407                 sb_breadahead_unmovable(sb, block);
4408         }
4409
4410         for (i = 0; i < db_count; i++) {
4411                 struct buffer_head *bh;
4412
4413                 block = descriptor_loc(sb, logical_sb_block, i);
4414                 bh = sb_bread_unmovable(sb, block);
4415                 if (!bh) {
4416                         ext4_msg(sb, KERN_ERR,
4417                                "can't read group descriptor %d", i);
4418                         db_count = i;
4419                         goto failed_mount2;
4420                 }
4421                 rcu_read_lock();
4422                 rcu_dereference(sbi->s_group_desc)[i] = bh;
4423                 rcu_read_unlock();
4424         }
4425         sbi->s_gdb_count = db_count;
4426         if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
4427                 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
4428                 ret = -EFSCORRUPTED;
4429                 goto failed_mount2;
4430         }
4431
4432         timer_setup(&sbi->s_err_report, print_daily_error_info, 0);
4433
4434         /* Register extent status tree shrinker */
4435         if (ext4_es_register_shrinker(sbi))
4436                 goto failed_mount3;
4437
4438         sbi->s_stripe = ext4_get_stripe_size(sbi);
4439         sbi->s_extent_max_zeroout_kb = 32;
4440
4441         /*
4442          * set up enough so that it can read an inode
4443          */
4444         sb->s_op = &ext4_sops;
4445         sb->s_export_op = &ext4_export_ops;
4446         sb->s_xattr = ext4_xattr_handlers;
4447 #ifdef CONFIG_FS_ENCRYPTION
4448         sb->s_cop = &ext4_cryptops;
4449 #endif
4450 #ifdef CONFIG_FS_VERITY
4451         sb->s_vop = &ext4_verityops;
4452 #endif
4453 #ifdef CONFIG_QUOTA
4454         sb->dq_op = &ext4_quota_operations;
4455         if (ext4_has_feature_quota(sb))
4456                 sb->s_qcop = &dquot_quotactl_sysfile_ops;
4457         else
4458                 sb->s_qcop = &ext4_qctl_operations;
4459         sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
4460 #endif
4461         memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
4462
4463         INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
4464         mutex_init(&sbi->s_orphan_lock);
4465
4466         sb->s_root = NULL;
4467
4468         needs_recovery = (es->s_last_orphan != 0 ||
4469                           ext4_has_feature_journal_needs_recovery(sb));
4470
4471         if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
4472                 if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
4473                         goto failed_mount3a;
4474
4475         /*
4476          * The first inode we look at is the journal inode.  Don't try
4477          * root first: it may be modified in the journal!
4478          */
4479         if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
4480                 err = ext4_load_journal(sb, es, journal_devnum);
4481                 if (err)
4482                         goto failed_mount3a;
4483         } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
4484                    ext4_has_feature_journal_needs_recovery(sb)) {
4485                 ext4_msg(sb, KERN_ERR, "required journal recovery "
4486                        "suppressed and not mounted read-only");
4487                 goto failed_mount_wq;
4488         } else {
4489                 /* Nojournal mode, all journal mount options are illegal */
4490                 if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
4491                         ext4_msg(sb, KERN_ERR, "can't mount with "
4492                                  "journal_checksum, fs mounted w/o journal");
4493                         goto failed_mount_wq;
4494                 }
4495                 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4496                         ext4_msg(sb, KERN_ERR, "can't mount with "
4497                                  "journal_async_commit, fs mounted w/o journal");
4498                         goto failed_mount_wq;
4499                 }
4500                 if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
4501                         ext4_msg(sb, KERN_ERR, "can't mount with "
4502                                  "commit=%lu, fs mounted w/o journal",
4503                                  sbi->s_commit_interval / HZ);
4504                         goto failed_mount_wq;
4505                 }
4506                 if (EXT4_MOUNT_DATA_FLAGS &
4507                     (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
4508                         ext4_msg(sb, KERN_ERR, "can't mount with "
4509                                  "data=, fs mounted w/o journal");
4510                         goto failed_mount_wq;
4511                 }
4512                 sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
4513                 clear_opt(sb, JOURNAL_CHECKSUM);
4514                 clear_opt(sb, DATA_FLAGS);
4515                 sbi->s_journal = NULL;
4516                 needs_recovery = 0;
4517                 goto no_journal;
4518         }
4519
4520         if (ext4_has_feature_64bit(sb) &&
4521             !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
4522                                        JBD2_FEATURE_INCOMPAT_64BIT)) {
4523                 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
4524                 goto failed_mount_wq;
4525         }
4526
4527         if (!set_journal_csum_feature_set(sb)) {
4528                 ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
4529                          "feature set");
4530                 goto failed_mount_wq;
4531         }
4532
4533         /* We have now updated the journal if required, so we can
4534          * validate the data journaling mode. */
4535         switch (test_opt(sb, DATA_FLAGS)) {
4536         case 0:
4537                 /* No mode set, assume a default based on the journal
4538                  * capabilities: ORDERED_DATA if the journal can
4539                  * cope, else JOURNAL_DATA
4540                  */
4541                 if (jbd2_journal_check_available_features
4542                     (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4543                         set_opt(sb, ORDERED_DATA);
4544                         sbi->s_def_mount_opt |= EXT4_MOUNT_ORDERED_DATA;
4545                 } else {
4546                         set_opt(sb, JOURNAL_DATA);
4547                         sbi->s_def_mount_opt |= EXT4_MOUNT_JOURNAL_DATA;
4548                 }
4549                 break;
4550
4551         case EXT4_MOUNT_ORDERED_DATA:
4552         case EXT4_MOUNT_WRITEBACK_DATA:
4553                 if (!jbd2_journal_check_available_features
4554                     (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
4555                         ext4_msg(sb, KERN_ERR, "Journal does not support "
4556                                "requested data journaling mode");
4557                         goto failed_mount_wq;
4558                 }
4559         default:
4560                 break;
4561         }
4562
4563         if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA &&
4564             test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
4565                 ext4_msg(sb, KERN_ERR, "can't mount with "
4566                         "journal_async_commit in data=ordered mode");
4567                 goto failed_mount_wq;
4568         }
4569
4570         set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
4571
4572         sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
4573
4574 no_journal:
4575         if (!test_opt(sb, NO_MBCACHE)) {
4576                 sbi->s_ea_block_cache = ext4_xattr_create_cache();
4577                 if (!sbi->s_ea_block_cache) {
4578                         ext4_msg(sb, KERN_ERR,
4579                                  "Failed to create ea_block_cache");
4580                         goto failed_mount_wq;
4581                 }
4582
4583                 if (ext4_has_feature_ea_inode(sb)) {
4584                         sbi->s_ea_inode_cache = ext4_xattr_create_cache();
4585                         if (!sbi->s_ea_inode_cache) {
4586                                 ext4_msg(sb, KERN_ERR,
4587                                          "Failed to create ea_inode_cache");
4588                                 goto failed_mount_wq;
4589                         }
4590                 }
4591         }
4592
4593         if (ext4_has_feature_verity(sb) && blocksize != PAGE_SIZE) {
4594                 ext4_msg(sb, KERN_ERR, "Unsupported blocksize for fs-verity");
4595                 goto failed_mount_wq;
4596         }
4597
4598         if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
4599             !ext4_has_feature_encrypt(sb)) {
4600                 ext4_set_feature_encrypt(sb);
4601                 ext4_commit_super(sb, 1);
4602         }
4603
4604         /*
4605          * Get the # of file system overhead blocks from the
4606          * superblock if present.
4607          */
4608         if (es->s_overhead_clusters)
4609                 sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
4610         else {
4611                 err = ext4_calculate_overhead(sb);
4612                 if (err)
4613                         goto failed_mount_wq;
4614         }
4615
4616         /*
4617          * The maximum number of concurrent works can be high and
4618          * concurrency isn't really necessary.  Limit it to 1.
4619          */
4620         EXT4_SB(sb)->rsv_conversion_wq =
4621                 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
4622         if (!EXT4_SB(sb)->rsv_conversion_wq) {
4623                 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
4624                 ret = -ENOMEM;
4625                 goto failed_mount4;
4626         }
4627
4628         /*
4629          * The jbd2_journal_load will have done any necessary log recovery,
4630          * so we can safely mount the rest of the filesystem now.
4631          */
4632
4633         root = ext4_iget(sb, EXT4_ROOT_INO, EXT4_IGET_SPECIAL);
4634         if (IS_ERR(root)) {
4635                 ext4_msg(sb, KERN_ERR, "get root inode failed");
4636                 ret = PTR_ERR(root);
4637                 root = NULL;
4638                 goto failed_mount4;
4639         }
4640         if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
4641                 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
4642                 iput(root);
4643                 goto failed_mount4;
4644         }
4645
4646 #ifdef CONFIG_UNICODE
4647         if (sbi->s_encoding)
4648                 sb->s_d_op = &ext4_dentry_ops;
4649 #endif
4650
4651         sb->s_root = d_make_root(root);
4652         if (!sb->s_root) {
4653                 ext4_msg(sb, KERN_ERR, "get root dentry failed");
4654                 ret = -ENOMEM;
4655                 goto failed_mount4;
4656         }
4657
4658         ret = ext4_setup_super(sb, es, sb_rdonly(sb));
4659         if (ret == -EROFS) {
4660                 sb->s_flags |= SB_RDONLY;
4661                 ret = 0;
4662         } else if (ret)
4663                 goto failed_mount4a;
4664
4665         ext4_set_resv_clusters(sb);
4666
4667         err = ext4_setup_system_zone(sb);
4668         if (err) {
4669                 ext4_msg(sb, KERN_ERR, "failed to initialize system "
4670                          "zone (%d)", err);
4671                 goto failed_mount4a;
4672         }
4673
4674         ext4_ext_init(sb);
4675         err = ext4_mb_init(sb);
4676         if (err) {
4677                 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
4678                          err);
4679                 goto failed_mount5;
4680         }
4681
4682         block = ext4_count_free_clusters(sb);
4683         ext4_free_blocks_count_set(sbi->s_es,
4684                                    EXT4_C2B(sbi, block));
4685         ext4_superblock_csum_set(sb);
4686         err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4687                                   GFP_KERNEL);
4688         if (!err) {
4689                 unsigned long freei = ext4_count_free_inodes(sb);
4690                 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4691                 ext4_superblock_csum_set(sb);
4692                 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4693                                           GFP_KERNEL);
4694         }
4695         if (!err)
4696                 err = percpu_counter_init(&sbi->s_dirs_counter,
4697                                           ext4_count_dirs(sb), GFP_KERNEL);
4698         if (!err)
4699                 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4700                                           GFP_KERNEL);
4701         if (!err)
4702                 err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
4703
4704         if (err) {
4705                 ext4_msg(sb, KERN_ERR, "insufficient memory");
4706                 goto failed_mount6;
4707         }
4708
4709         if (ext4_has_feature_flex_bg(sb))
4710                 if (!ext4_fill_flex_info(sb)) {
4711                         ext4_msg(sb, KERN_ERR,
4712                                "unable to initialize "
4713                                "flex_bg meta info!");
4714                         goto failed_mount6;
4715                 }
4716
4717         err = ext4_register_li_request(sb, first_not_zeroed);
4718         if (err)
4719                 goto failed_mount6;
4720
4721         err = ext4_register_sysfs(sb);
4722         if (err)
4723                 goto failed_mount7;
4724
4725 #ifdef CONFIG_QUOTA
4726         /* Enable quota usage during mount. */
4727         if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
4728                 err = ext4_enable_quotas(sb);
4729                 if (err)
4730                         goto failed_mount8;
4731         }
4732 #endif  /* CONFIG_QUOTA */
4733
4734         EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
4735         ext4_orphan_cleanup(sb, es);
4736         EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
4737         if (needs_recovery) {
4738                 ext4_msg(sb, KERN_INFO, "recovery complete");
4739                 ext4_mark_recovery_complete(sb, es);
4740         }
4741         if (EXT4_SB(sb)->s_journal) {
4742                 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
4743                         descr = " journalled data mode";
4744                 else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
4745                         descr = " ordered data mode";
4746                 else
4747                         descr = " writeback data mode";
4748         } else
4749                 descr = "out journal";
4750
4751         if (test_opt(sb, DISCARD)) {
4752                 struct request_queue *q = bdev_get_queue(sb->s_bdev);
4753                 if (!blk_queue_discard(q))
4754                         ext4_msg(sb, KERN_WARNING,
4755                                  "mounting with \"discard\" option, but "
4756                                  "the device does not support discard");
4757         }
4758
4759         if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount"))
4760                 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
4761                          "Opts: %.*s%s%s", descr,
4762                          (int) sizeof(sbi->s_es->s_mount_opts),
4763                          sbi->s_es->s_mount_opts,
4764                          *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
4765
4766         if (es->s_error_count)
4767                 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
4768
4769         /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
4770         ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
4771         ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
4772         ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
4773
4774         kfree(orig_data);
4775         return 0;
4776
4777 cantfind_ext4:
4778         if (!silent)
4779                 ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
4780         goto failed_mount;
4781
4782 #ifdef CONFIG_QUOTA
4783 failed_mount8:
4784         ext4_unregister_sysfs(sb);
4785 #endif
4786 failed_mount7:
4787         ext4_unregister_li_request(sb);
4788 failed_mount6:
4789         ext4_mb_release(sb);
4790         rcu_read_lock();
4791         flex_groups = rcu_dereference(sbi->s_flex_groups);
4792         if (flex_groups) {
4793                 for (i = 0; i < sbi->s_flex_groups_allocated; i++)
4794                         kvfree(flex_groups[i]);
4795                 kvfree(flex_groups);
4796         }
4797         rcu_read_unlock();
4798         percpu_counter_destroy(&sbi->s_freeclusters_counter);
4799         percpu_counter_destroy(&sbi->s_freeinodes_counter);
4800         percpu_counter_destroy(&sbi->s_dirs_counter);
4801         percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4802         percpu_free_rwsem(&sbi->s_writepages_rwsem);
4803 failed_mount5:
4804         ext4_ext_release(sb);
4805         ext4_release_system_zone(sb);
4806 failed_mount4a:
4807         dput(sb->s_root);
4808         sb->s_root = NULL;
4809 failed_mount4:
4810         ext4_msg(sb, KERN_ERR, "mount failed");
4811         if (EXT4_SB(sb)->rsv_conversion_wq)
4812                 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4813 failed_mount_wq:
4814         ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
4815         sbi->s_ea_inode_cache = NULL;
4816
4817         ext4_xattr_destroy_cache(sbi->s_ea_block_cache);
4818         sbi->s_ea_block_cache = NULL;
4819
4820         if (sbi->s_journal) {
4821                 jbd2_journal_destroy(sbi->s_journal);
4822                 sbi->s_journal = NULL;
4823         }
4824 failed_mount3a:
4825         ext4_es_unregister_shrinker(sbi);
4826 failed_mount3:
4827         del_timer_sync(&sbi->s_err_report);
4828         if (sbi->s_mmp_tsk)
4829                 kthread_stop(sbi->s_mmp_tsk);
4830 failed_mount2:
4831         rcu_read_lock();
4832         group_desc = rcu_dereference(sbi->s_group_desc);
4833         for (i = 0; i < db_count; i++)
4834                 brelse(group_desc[i]);
4835         kvfree(group_desc);
4836         rcu_read_unlock();
4837 failed_mount:
4838         if (sbi->s_chksum_driver)
4839                 crypto_free_shash(sbi->s_chksum_driver);
4840
4841 #ifdef CONFIG_UNICODE
4842         utf8_unload(sbi->s_encoding);
4843 #endif
4844
4845 #ifdef CONFIG_QUOTA
4846         for (i = 0; i < EXT4_MAXQUOTAS; i++)
4847                 kfree(get_qf_name(sb, sbi, i));
4848 #endif
4849         ext4_blkdev_remove(sbi);
4850         brelse(bh);
4851 out_fail:
4852         sb->s_fs_info = NULL;
4853         kfree(sbi->s_blockgroup_lock);
4854 out_free_base:
4855         kfree(sbi);
4856         kfree(orig_data);
4857         fs_put_dax(dax_dev);
4858         return err ? err : ret;
4859 }
4860
4861 /*
4862  * Setup any per-fs journal parameters now.  We'll do this both on
4863  * initial mount, once the journal has been initialised but before we've
4864  * done any recovery; and again on any subsequent remount.
4865  */
4866 static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
4867 {
4868         struct ext4_sb_info *sbi = EXT4_SB(sb);
4869
4870         journal->j_commit_interval = sbi->s_commit_interval;
4871         journal->j_min_batch_time = sbi->s_min_batch_time;
4872         journal->j_max_batch_time = sbi->s_max_batch_time;
4873
4874         write_lock(&journal->j_state_lock);
4875         if (test_opt(sb, BARRIER))
4876                 journal->j_flags |= JBD2_BARRIER;
4877         else
4878                 journal->j_flags &= ~JBD2_BARRIER;
4879         if (test_opt(sb, DATA_ERR_ABORT))
4880                 journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
4881         else
4882                 journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
4883         write_unlock(&journal->j_state_lock);
4884 }
4885
4886 static struct inode *ext4_get_journal_inode(struct super_block *sb,
4887                                              unsigned int journal_inum)
4888 {
4889         struct inode *journal_inode;
4890
4891         /*
4892          * Test for the existence of a valid inode on disk.  Bad things
4893          * happen if we iget() an unused inode, as the subsequent iput()
4894          * will try to delete it.
4895          */
4896         journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL);
4897         if (IS_ERR(journal_inode)) {
4898                 ext4_msg(sb, KERN_ERR, "no journal found");
4899                 return NULL;
4900         }
4901         if (!journal_inode->i_nlink) {
4902                 make_bad_inode(journal_inode);
4903                 iput(journal_inode);
4904                 ext4_msg(sb, KERN_ERR, "journal inode is deleted");
4905                 return NULL;
4906         }
4907
4908         jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
4909                   journal_inode, journal_inode->i_size);
4910         if (!S_ISREG(journal_inode->i_mode)) {
4911                 ext4_msg(sb, KERN_ERR, "invalid journal inode");
4912                 iput(journal_inode);
4913                 return NULL;
4914         }
4915         return journal_inode;
4916 }
4917
4918 static journal_t *ext4_get_journal(struct super_block *sb,
4919                                    unsigned int journal_inum)
4920 {
4921         struct inode *journal_inode;
4922         journal_t *journal;
4923
4924         BUG_ON(!ext4_has_feature_journal(sb));
4925
4926         journal_inode = ext4_get_journal_inode(sb, journal_inum);
4927         if (!journal_inode)
4928                 return NULL;
4929
4930         journal = jbd2_journal_init_inode(journal_inode);
4931         if (!journal) {
4932                 ext4_msg(sb, KERN_ERR, "Could not load journal inode");
4933                 iput(journal_inode);
4934                 return NULL;
4935         }
4936         journal->j_private = sb;
4937         ext4_init_journal_params(sb, journal);
4938         return journal;
4939 }
4940
4941 static journal_t *ext4_get_dev_journal(struct super_block *sb,
4942                                        dev_t j_dev)
4943 {
4944         struct buffer_head *bh;
4945         journal_t *journal;
4946         ext4_fsblk_t start;
4947         ext4_fsblk_t len;
4948         int hblock, blocksize;
4949         ext4_fsblk_t sb_block;
4950         unsigned long offset;
4951         struct ext4_super_block *es;
4952         struct block_device *bdev;
4953
4954         BUG_ON(!ext4_has_feature_journal(sb));
4955
4956         bdev = ext4_blkdev_get(j_dev, sb);
4957         if (bdev == NULL)
4958                 return NULL;
4959
4960         blocksize = sb->s_blocksize;
4961         hblock = bdev_logical_block_size(bdev);
4962         if (blocksize < hblock) {
4963                 ext4_msg(sb, KERN_ERR,
4964                         "blocksize too small for journal device");
4965                 goto out_bdev;
4966         }
4967
4968         sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
4969         offset = EXT4_MIN_BLOCK_SIZE % blocksize;
4970         set_blocksize(bdev, blocksize);
4971         if (!(bh = __bread(bdev, sb_block, blocksize))) {
4972                 ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
4973                        "external journal");
4974                 goto out_bdev;
4975         }
4976
4977         es = (struct ext4_super_block *) (bh->b_data + offset);
4978         if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
4979             !(le32_to_cpu(es->s_feature_incompat) &
4980               EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
4981                 ext4_msg(sb, KERN_ERR, "external journal has "
4982                                         "bad superblock");
4983                 brelse(bh);
4984                 goto out_bdev;
4985         }
4986
4987         if ((le32_to_cpu(es->s_feature_ro_compat) &
4988              EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
4989             es->s_checksum != ext4_superblock_csum(sb, es)) {
4990                 ext4_msg(sb, KERN_ERR, "external journal has "
4991                                        "corrupt superblock");
4992                 brelse(bh);
4993                 goto out_bdev;
4994         }
4995
4996         if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
4997                 ext4_msg(sb, KERN_ERR, "journal UUID does not match");
4998                 brelse(bh);
4999                 goto out_bdev;
5000         }
5001
5002         len = ext4_blocks_count(es);
5003         start = sb_block + 1;
5004         brelse(bh);     /* we're done with the superblock */
5005
5006         journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
5007                                         start, len, blocksize);
5008         if (!journal) {
5009                 ext4_msg(sb, KERN_ERR, "failed to create device journal");
5010                 goto out_bdev;
5011         }
5012         journal->j_private = sb;
5013         ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
5014         wait_on_buffer(journal->j_sb_buffer);
5015         if (!buffer_uptodate(journal->j_sb_buffer)) {
5016                 ext4_msg(sb, KERN_ERR, "I/O error on journal device");
5017                 goto out_journal;
5018         }
5019         if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
5020                 ext4_msg(sb, KERN_ERR, "External journal has more than one "
5021                                         "user (unsupported) - %d",
5022                         be32_to_cpu(journal->j_superblock->s_nr_users));
5023                 goto out_journal;
5024         }
5025         EXT4_SB(sb)->journal_bdev = bdev;
5026         ext4_init_journal_params(sb, journal);
5027         return journal;
5028
5029 out_journal:
5030         jbd2_journal_destroy(journal);
5031 out_bdev:
5032         ext4_blkdev_put(bdev);
5033         return NULL;
5034 }
5035
5036 static int ext4_load_journal(struct super_block *sb,
5037                              struct ext4_super_block *es,
5038                              unsigned long journal_devnum)
5039 {
5040         journal_t *journal;
5041         unsigned int journal_inum = le32_to_cpu(es->s_journal_inum);
5042         dev_t journal_dev;
5043         int err = 0;
5044         int really_read_only;
5045
5046         BUG_ON(!ext4_has_feature_journal(sb));
5047
5048         if (journal_devnum &&
5049             journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5050                 ext4_msg(sb, KERN_INFO, "external journal device major/minor "
5051                         "numbers have changed");
5052                 journal_dev = new_decode_dev(journal_devnum);
5053         } else
5054                 journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
5055
5056         really_read_only = bdev_read_only(sb->s_bdev);
5057
5058         /*
5059          * Are we loading a blank journal or performing recovery after a
5060          * crash?  For recovery, we need to check in advance whether we
5061          * can get read-write access to the device.
5062          */
5063         if (ext4_has_feature_journal_needs_recovery(sb)) {
5064                 if (sb_rdonly(sb)) {
5065                         ext4_msg(sb, KERN_INFO, "INFO: recovery "
5066                                         "required on readonly filesystem");
5067                         if (really_read_only) {
5068                                 ext4_msg(sb, KERN_ERR, "write access "
5069                                         "unavailable, cannot proceed "
5070                                         "(try mounting with noload)");
5071                                 return -EROFS;
5072                         }
5073                         ext4_msg(sb, KERN_INFO, "write access will "
5074                                "be enabled during recovery");
5075                 }
5076         }
5077
5078         if (journal_inum && journal_dev) {
5079                 ext4_msg(sb, KERN_ERR, "filesystem has both journal "
5080                        "and inode journals!");
5081                 return -EINVAL;
5082         }
5083
5084         if (journal_inum) {
5085                 if (!(journal = ext4_get_journal(sb, journal_inum)))
5086                         return -EINVAL;
5087         } else {
5088                 if (!(journal = ext4_get_dev_journal(sb, journal_dev)))
5089                         return -EINVAL;
5090         }
5091
5092         if (!(journal->j_flags & JBD2_BARRIER))
5093                 ext4_msg(sb, KERN_INFO, "barriers disabled");
5094
5095         if (!ext4_has_feature_journal_needs_recovery(sb))
5096                 err = jbd2_journal_wipe(journal, !really_read_only);
5097         if (!err) {
5098                 char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
5099                 if (save)
5100                         memcpy(save, ((char *) es) +
5101                                EXT4_S_ERR_START, EXT4_S_ERR_LEN);
5102                 err = jbd2_journal_load(journal);
5103                 if (save)
5104                         memcpy(((char *) es) + EXT4_S_ERR_START,
5105                                save, EXT4_S_ERR_LEN);
5106                 kfree(save);
5107         }
5108
5109         if (err) {
5110                 ext4_msg(sb, KERN_ERR, "error loading journal");
5111                 jbd2_journal_destroy(journal);
5112                 return err;
5113         }
5114
5115         EXT4_SB(sb)->s_journal = journal;
5116         ext4_clear_journal_err(sb, es);
5117
5118         if (!really_read_only && journal_devnum &&
5119             journal_devnum != le32_to_cpu(es->s_journal_dev)) {
5120                 es->s_journal_dev = cpu_to_le32(journal_devnum);
5121
5122                 /* Make sure we flush the recovery flag to disk. */
5123                 ext4_commit_super(sb, 1);
5124         }
5125
5126         return 0;
5127 }
5128
5129 static int ext4_commit_super(struct super_block *sb, int sync)
5130 {
5131         struct ext4_super_block *es = EXT4_SB(sb)->s_es;
5132         struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
5133         int error = 0;
5134
5135         if (!sbh || block_device_ejected(sb))
5136                 return error;
5137
5138         /*
5139          * The superblock bh should be mapped, but it might not be if the
5140          * device was hot-removed. Not much we can do but fail the I/O.
5141          */
5142         if (!buffer_mapped(sbh))
5143                 return error;
5144
5145         /*
5146          * If the file system is mounted read-only, don't update the
5147          * superblock write time.  This avoids updating the superblock
5148          * write time when we are mounting the root file system
5149          * read/only but we need to replay the journal; at that point,
5150          * for people who are east of GMT and who make their clock
5151          * tick in localtime for Windows bug-for-bug compatibility,
5152          * the clock is set in the future, and this will cause e2fsck
5153          * to complain and force a full file system check.
5154          */
5155         if (!(sb->s_flags & SB_RDONLY))
5156                 ext4_update_tstamp(es, s_wtime);
5157         if (sb->s_bdev->bd_part)
5158                 es->s_kbytes_written =
5159                         cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
5160                             ((part_stat_read(sb->s_bdev->bd_part,
5161                                              sectors[STAT_WRITE]) -
5162                               EXT4_SB(sb)->s_sectors_written_start) >> 1));
5163         else
5164                 es->s_kbytes_written =
5165                         cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
5166         if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
5167                 ext4_free_blocks_count_set(es,
5168                         EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
5169                                 &EXT4_SB(sb)->s_freeclusters_counter)));
5170         if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
5171                 es->s_free_inodes_count =
5172                         cpu_to_le32(percpu_counter_sum_positive(
5173                                 &EXT4_SB(sb)->s_freeinodes_counter));
5174         BUFFER_TRACE(sbh, "marking dirty");
5175         ext4_superblock_csum_set(sb);
5176         if (sync)
5177                 lock_buffer(sbh);
5178         if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) {
5179                 /*
5180                  * Oh, dear.  A previous attempt to write the
5181                  * superblock failed.  This could happen because the
5182                  * USB device was yanked out.  Or it could happen to
5183                  * be a transient write error and maybe the block will
5184                  * be remapped.  Nothing we can do but to retry the
5185                  * write and hope for the best.
5186                  */
5187                 ext4_msg(sb, KERN_ERR, "previous I/O error to "
5188                        "superblock detected");
5189                 clear_buffer_write_io_error(sbh);
5190                 set_buffer_uptodate(sbh);
5191         }
5192         mark_buffer_dirty(sbh);
5193         if (sync) {
5194                 unlock_buffer(sbh);
5195                 error = __sync_dirty_buffer(sbh,
5196                         REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0));
5197                 if (buffer_write_io_error(sbh)) {
5198                         ext4_msg(sb, KERN_ERR, "I/O error while writing "
5199                                "superblock");
5200                         clear_buffer_write_io_error(sbh);
5201                         set_buffer_uptodate(sbh);
5202                 }
5203         }
5204         return error;
5205 }
5206
5207 /*
5208  * Have we just finished recovery?  If so, and if we are mounting (or
5209  * remounting) the filesystem readonly, then we will end up with a
5210  * consistent fs on disk.  Record that fact.
5211  */
5212 static void ext4_mark_recovery_complete(struct super_block *sb,
5213                                         struct ext4_super_block *es)
5214 {
5215         journal_t *journal = EXT4_SB(sb)->s_journal;
5216
5217         if (!ext4_has_feature_journal(sb)) {
5218                 BUG_ON(journal != NULL);
5219                 return;
5220         }
5221         jbd2_journal_lock_updates(journal);
5222         if (jbd2_journal_flush(journal) < 0)
5223                 goto out;
5224
5225         if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
5226                 ext4_clear_feature_journal_needs_recovery(sb);
5227                 ext4_commit_super(sb, 1);
5228         }
5229
5230 out:
5231         jbd2_journal_unlock_updates(journal);
5232 }
5233
5234 /*
5235  * If we are mounting (or read-write remounting) a filesystem whose journal
5236  * has recorded an error from a previous lifetime, move that error to the
5237  * main filesystem now.
5238  */
5239 static void ext4_clear_journal_err(struct super_block *sb,
5240                                    struct ext4_super_block *es)
5241 {
5242         journal_t *journal;
5243         int j_errno;
5244         const char *errstr;
5245
5246         BUG_ON(!ext4_has_feature_journal(sb));
5247
5248         journal = EXT4_SB(sb)->s_journal;
5249
5250         /*
5251          * Now check for any error status which may have been recorded in the
5252          * journal by a prior ext4_error() or ext4_abort()
5253          */
5254
5255         j_errno = jbd2_journal_errno(journal);
5256         if (j_errno) {
5257                 char nbuf[16];
5258
5259                 errstr = ext4_decode_error(sb, j_errno, nbuf);
5260                 ext4_warning(sb, "Filesystem error recorded "
5261                              "from previous mount: %s", errstr);
5262                 ext4_warning(sb, "Marking fs in need of filesystem check.");
5263
5264                 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
5265                 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
5266                 ext4_commit_super(sb, 1);
5267
5268                 jbd2_journal_clear_err(journal);
5269                 jbd2_journal_update_sb_errno(journal);
5270         }
5271 }
5272
5273 /*
5274  * Force the running and committing transactions to commit,
5275  * and wait on the commit.
5276  */
5277 int ext4_force_commit(struct super_block *sb)
5278 {
5279         journal_t *journal;
5280
5281         if (sb_rdonly(sb))
5282                 return 0;
5283
5284         journal = EXT4_SB(sb)->s_journal;
5285         return ext4_journal_force_commit(journal);
5286 }
5287
5288 static int ext4_sync_fs(struct super_block *sb, int wait)
5289 {
5290         int ret = 0;
5291         tid_t target;
5292         bool needs_barrier = false;
5293         struct ext4_sb_info *sbi = EXT4_SB(sb);
5294
5295         if (unlikely(ext4_forced_shutdown(sbi)))
5296                 return 0;
5297
5298         trace_ext4_sync_fs(sb, wait);
5299         flush_workqueue(sbi->rsv_conversion_wq);
5300         /*
5301          * Writeback quota in non-journalled quota case - journalled quota has
5302          * no dirty dquots
5303          */
5304         dquot_writeback_dquots(sb, -1);
5305         /*
5306          * Data writeback is possible w/o journal transaction, so barrier must
5307          * being sent at the end of the function. But we can skip it if
5308          * transaction_commit will do it for us.
5309          */
5310         if (sbi->s_journal) {
5311                 target = jbd2_get_latest_transaction(sbi->s_journal);
5312                 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
5313                     !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
5314                         needs_barrier = true;
5315
5316                 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
5317                         if (wait)
5318                                 ret = jbd2_log_wait_commit(sbi->s_journal,
5319                                                            target);
5320                 }
5321         } else if (wait && test_opt(sb, BARRIER))
5322                 needs_barrier = true;
5323         if (needs_barrier) {
5324                 int err;
5325                 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
5326                 if (!ret)
5327                         ret = err;
5328         }
5329
5330         return ret;
5331 }
5332
5333 /*
5334  * LVM calls this function before a (read-only) snapshot is created.  This
5335  * gives us a chance to flush the journal completely and mark the fs clean.
5336  *
5337  * Note that only this function cannot bring a filesystem to be in a clean
5338  * state independently. It relies on upper layer to stop all data & metadata
5339  * modifications.
5340  */
5341 static int ext4_freeze(struct super_block *sb)
5342 {
5343         int error = 0;
5344         journal_t *journal;
5345
5346         if (sb_rdonly(sb))
5347                 return 0;
5348
5349         journal = EXT4_SB(sb)->s_journal;
5350
5351         if (journal) {
5352                 /* Now we set up the journal barrier. */
5353                 jbd2_journal_lock_updates(journal);
5354
5355                 /*
5356                  * Don't clear the needs_recovery flag if we failed to
5357                  * flush the journal.
5358                  */
5359                 error = jbd2_journal_flush(journal);
5360                 if (error < 0)
5361                         goto out;
5362
5363                 /* Journal blocked and flushed, clear needs_recovery flag. */
5364                 ext4_clear_feature_journal_needs_recovery(sb);
5365         }
5366
5367         error = ext4_commit_super(sb, 1);
5368 out:
5369         if (journal)
5370                 /* we rely on upper layer to stop further updates */
5371                 jbd2_journal_unlock_updates(journal);
5372         return error;
5373 }
5374
5375 /*
5376  * Called by LVM after the snapshot is done.  We need to reset the RECOVER
5377  * flag here, even though the filesystem is not technically dirty yet.
5378  */
5379 static int ext4_unfreeze(struct super_block *sb)
5380 {
5381         if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb)))
5382                 return 0;
5383
5384         if (EXT4_SB(sb)->s_journal) {
5385                 /* Reset the needs_recovery flag before the fs is unlocked. */
5386                 ext4_set_feature_journal_needs_recovery(sb);
5387         }
5388
5389         ext4_commit_super(sb, 1);
5390         return 0;
5391 }
5392
5393 /*
5394  * Structure to save mount options for ext4_remount's benefit
5395  */
5396 struct ext4_mount_options {
5397         unsigned long s_mount_opt;
5398         unsigned long s_mount_opt2;
5399         kuid_t s_resuid;
5400         kgid_t s_resgid;
5401         unsigned long s_commit_interval;
5402         u32 s_min_batch_time, s_max_batch_time;
5403 #ifdef CONFIG_QUOTA
5404         int s_jquota_fmt;
5405         char *s_qf_names[EXT4_MAXQUOTAS];
5406 #endif
5407 };
5408
5409 static int ext4_remount(struct super_block *sb, int *flags, char *data)
5410 {
5411         struct ext4_super_block *es;
5412         struct ext4_sb_info *sbi = EXT4_SB(sb);
5413         unsigned long old_sb_flags;
5414         struct ext4_mount_options old_opts;
5415         int enable_quota = 0;
5416         ext4_group_t g;
5417         unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
5418         int err = 0;
5419 #ifdef CONFIG_QUOTA
5420         int i, j;
5421         char *to_free[EXT4_MAXQUOTAS];
5422 #endif
5423         char *orig_data = kstrdup(data, GFP_KERNEL);
5424
5425         if (data && !orig_data)
5426                 return -ENOMEM;
5427
5428         /* Store the original options */
5429         old_sb_flags = sb->s_flags;
5430         old_opts.s_mount_opt = sbi->s_mount_opt;
5431         old_opts.s_mount_opt2 = sbi->s_mount_opt2;
5432         old_opts.s_resuid = sbi->s_resuid;
5433         old_opts.s_resgid = sbi->s_resgid;
5434         old_opts.s_commit_interval = sbi->s_commit_interval;
5435         old_opts.s_min_batch_time = sbi->s_min_batch_time;
5436         old_opts.s_max_batch_time = sbi->s_max_batch_time;
5437 #ifdef CONFIG_QUOTA
5438         old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
5439         for (i = 0; i < EXT4_MAXQUOTAS; i++)
5440                 if (sbi->s_qf_names[i]) {
5441                         char *qf_name = get_qf_name(sb, sbi, i);
5442
5443                         old_opts.s_qf_names[i] = kstrdup(qf_name, GFP_KERNEL);
5444                         if (!old_opts.s_qf_names[i]) {
5445                                 for (j = 0; j < i; j++)
5446                                         kfree(old_opts.s_qf_names[j]);
5447                                 kfree(orig_data);
5448                                 return -ENOMEM;
5449                         }
5450                 } else
5451                         old_opts.s_qf_names[i] = NULL;
5452 #endif
5453         if (sbi->s_journal && sbi->s_journal->j_task->io_context)
5454                 journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
5455
5456         if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
5457                 err = -EINVAL;
5458                 goto restore_opts;
5459         }
5460
5461         if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
5462             test_opt(sb, JOURNAL_CHECKSUM)) {
5463                 ext4_msg(sb, KERN_ERR, "changing journal_checksum "
5464                          "during remount not supported; ignoring");
5465                 sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM;
5466         }
5467
5468         if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
5469                 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
5470                         ext4_msg(sb, KERN_ERR, "can't mount with "
5471                                  "both data=journal and delalloc");
5472                         err = -EINVAL;
5473                         goto restore_opts;
5474                 }
5475                 if (test_opt(sb, DIOREAD_NOLOCK)) {
5476                         ext4_msg(sb, KERN_ERR, "can't mount with "
5477                                  "both data=journal and dioread_nolock");
5478                         err = -EINVAL;
5479                         goto restore_opts;
5480                 }
5481         } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
5482                 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
5483                         ext4_msg(sb, KERN_ERR, "can't mount with "
5484                                 "journal_async_commit in data=ordered mode");
5485                         err = -EINVAL;
5486                         goto restore_opts;
5487                 }
5488         }
5489
5490         if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_NO_MBCACHE) {
5491                 ext4_msg(sb, KERN_ERR, "can't enable nombcache during remount");
5492                 err = -EINVAL;
5493                 goto restore_opts;
5494         }
5495
5496         if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
5497                 ext4_abort(sb, EXT4_ERR_ESHUTDOWN, "Abort forced by user");
5498
5499         sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
5500                 (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
5501
5502         es = sbi->s_es;
5503
5504         if (sbi->s_journal) {
5505                 ext4_init_journal_params(sb, sbi->s_journal);
5506                 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
5507         }
5508
5509         if (*flags & SB_LAZYTIME)
5510                 sb->s_flags |= SB_LAZYTIME;
5511
5512         if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
5513                 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
5514                         err = -EROFS;
5515                         goto restore_opts;
5516                 }
5517
5518                 if (*flags & SB_RDONLY) {
5519                         err = sync_filesystem(sb);
5520                         if (err < 0)
5521                                 goto restore_opts;
5522                         err = dquot_suspend(sb, -1);
5523                         if (err < 0)
5524                                 goto restore_opts;
5525
5526                         /*
5527                          * First of all, the unconditional stuff we have to do
5528                          * to disable replay of the journal when we next remount
5529                          */
5530                         sb->s_flags |= SB_RDONLY;
5531
5532                         /*
5533                          * OK, test if we are remounting a valid rw partition
5534                          * readonly, and if so set the rdonly flag and then
5535                          * mark the partition as valid again.
5536                          */
5537                         if (!(es->s_state & cpu_to_le16(EXT4_VALID_FS)) &&
5538                             (sbi->s_mount_state & EXT4_VALID_FS))
5539                                 es->s_state = cpu_to_le16(sbi->s_mount_state);
5540
5541                         if (sbi->s_journal)
5542                                 ext4_mark_recovery_complete(sb, es);
5543                         if (sbi->s_mmp_tsk)
5544                                 kthread_stop(sbi->s_mmp_tsk);
5545                 } else {
5546                         /* Make sure we can mount this feature set readwrite */
5547                         if (ext4_has_feature_readonly(sb) ||
5548                             !ext4_feature_set_ok(sb, 0)) {
5549                                 err = -EROFS;
5550                                 goto restore_opts;
5551                         }
5552                         /*
5553                          * Make sure the group descriptor checksums
5554                          * are sane.  If they aren't, refuse to remount r/w.
5555                          */
5556                         for (g = 0; g < sbi->s_groups_count; g++) {
5557                                 struct ext4_group_desc *gdp =
5558                                         ext4_get_group_desc(sb, g, NULL);
5559
5560                                 if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
5561                                         ext4_msg(sb, KERN_ERR,
5562                "ext4_remount: Checksum for group %u failed (%u!=%u)",
5563                 g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
5564                                                le16_to_cpu(gdp->bg_checksum));
5565                                         err = -EFSBADCRC;
5566                                         goto restore_opts;
5567                                 }
5568                         }
5569
5570                         /*
5571                          * If we have an unprocessed orphan list hanging
5572                          * around from a previously readonly bdev mount,
5573                          * require a full umount/remount for now.
5574                          */
5575                         if (es->s_last_orphan) {
5576                                 ext4_msg(sb, KERN_WARNING, "Couldn't "
5577                                        "remount RDWR because of unprocessed "
5578                                        "orphan inode list.  Please "
5579                                        "umount/remount instead");
5580                                 err = -EINVAL;
5581                                 goto restore_opts;
5582                         }
5583
5584                         /*
5585                          * Mounting a RDONLY partition read-write, so reread
5586                          * and store the current valid flag.  (It may have
5587                          * been changed by e2fsck since we originally mounted
5588                          * the partition.)
5589                          */
5590                         if (sbi->s_journal)
5591                                 ext4_clear_journal_err(sb, es);
5592                         sbi->s_mount_state = le16_to_cpu(es->s_state);
5593
5594                         err = ext4_setup_super(sb, es, 0);
5595                         if (err)
5596                                 goto restore_opts;
5597
5598                         sb->s_flags &= ~SB_RDONLY;
5599                         if (ext4_has_feature_mmp(sb))
5600                                 if (ext4_multi_mount_protect(sb,
5601                                                 le64_to_cpu(es->s_mmp_block))) {
5602                                         err = -EROFS;
5603                                         goto restore_opts;
5604                                 }
5605                         enable_quota = 1;
5606                 }
5607         }
5608
5609         /*
5610          * Reinitialize lazy itable initialization thread based on
5611          * current settings
5612          */
5613         if (sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))
5614                 ext4_unregister_li_request(sb);
5615         else {
5616                 ext4_group_t first_not_zeroed;
5617                 first_not_zeroed = ext4_has_uninit_itable(sb);
5618                 ext4_register_li_request(sb, first_not_zeroed);
5619         }
5620
5621         ext4_setup_system_zone(sb);
5622         if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) {
5623                 err = ext4_commit_super(sb, 1);
5624                 if (err)
5625                         goto restore_opts;
5626         }
5627
5628 #ifdef CONFIG_QUOTA
5629         /* Release old quota file names */
5630         for (i = 0; i < EXT4_MAXQUOTAS; i++)
5631                 kfree(old_opts.s_qf_names[i]);
5632         if (enable_quota) {
5633                 if (sb_any_quota_suspended(sb))
5634                         dquot_resume(sb, -1);
5635                 else if (ext4_has_feature_quota(sb)) {
5636                         err = ext4_enable_quotas(sb);
5637                         if (err)
5638                                 goto restore_opts;
5639                 }
5640         }
5641 #endif
5642
5643         *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
5644         ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
5645         kfree(orig_data);
5646         return 0;
5647
5648 restore_opts:
5649         sb->s_flags = old_sb_flags;
5650         sbi->s_mount_opt = old_opts.s_mount_opt;
5651         sbi->s_mount_opt2 = old_opts.s_mount_opt2;
5652         sbi->s_resuid = old_opts.s_resuid;
5653         sbi->s_resgid = old_opts.s_resgid;
5654         sbi->s_commit_interval = old_opts.s_commit_interval;
5655         sbi->s_min_batch_time = old_opts.s_min_batch_time;
5656         sbi->s_max_batch_time = old_opts.s_max_batch_time;
5657 #ifdef CONFIG_QUOTA
5658         sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
5659         for (i = 0; i < EXT4_MAXQUOTAS; i++) {
5660                 to_free[i] = get_qf_name(sb, sbi, i);
5661                 rcu_assign_pointer(sbi->s_qf_names[i], old_opts.s_qf_names[i]);
5662         }
5663         synchronize_rcu();
5664         for (i = 0; i < EXT4_MAXQUOTAS; i++)
5665                 kfree(to_free[i]);
5666 #endif
5667         kfree(orig_data);
5668         return err;
5669 }
5670
5671 #ifdef CONFIG_QUOTA
5672 static int ext4_statfs_project(struct super_block *sb,
5673                                kprojid_t projid, struct kstatfs *buf)
5674 {
5675         struct kqid qid;
5676         struct dquot *dquot;
5677         u64 limit;
5678         u64 curblock;
5679
5680         qid = make_kqid_projid(projid);
5681         dquot = dqget(sb, qid);
5682         if (IS_ERR(dquot))
5683                 return PTR_ERR(dquot);
5684         spin_lock(&dquot->dq_dqb_lock);
5685
5686         limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
5687                              dquot->dq_dqb.dqb_bhardlimit);
5688         limit >>= sb->s_blocksize_bits;
5689
5690         if (limit && buf->f_blocks > limit) {
5691                 curblock = (dquot->dq_dqb.dqb_curspace +
5692                             dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
5693                 buf->f_blocks = limit;
5694                 buf->f_bfree = buf->f_bavail =
5695                         (buf->f_blocks > curblock) ?
5696                          (buf->f_blocks - curblock) : 0;
5697         }
5698
5699         limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
5700                              dquot->dq_dqb.dqb_ihardlimit);
5701         if (limit && buf->f_files > limit) {
5702                 buf->f_files = limit;
5703                 buf->f_ffree =
5704                         (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
5705                          (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
5706         }
5707
5708         spin_unlock(&dquot->dq_dqb_lock);
5709         dqput(dquot);
5710         return 0;
5711 }
5712 #endif
5713
5714 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
5715 {
5716         struct super_block *sb = dentry->d_sb;
5717         struct ext4_sb_info *sbi = EXT4_SB(sb);
5718         struct ext4_super_block *es = sbi->s_es;
5719         ext4_fsblk_t overhead = 0, resv_blocks;
5720         u64 fsid;
5721         s64 bfree;
5722         resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
5723
5724         if (!test_opt(sb, MINIX_DF))
5725                 overhead = sbi->s_overhead;
5726
5727         buf->f_type = EXT4_SUPER_MAGIC;
5728         buf->f_bsize = sb->s_blocksize;
5729         buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
5730         bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
5731                 percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
5732         /* prevent underflow in case that few free space is available */
5733         buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
5734         buf->f_bavail = buf->f_bfree -
5735                         (ext4_r_blocks_count(es) + resv_blocks);
5736         if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
5737                 buf->f_bavail = 0;
5738         buf->f_files = le32_to_cpu(es->s_inodes_count);
5739         buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
5740         buf->f_namelen = EXT4_NAME_LEN;
5741         fsid = le64_to_cpup((void *)es->s_uuid) ^
5742                le64_to_cpup((void *)es->s_uuid + sizeof(u64));
5743         buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
5744         buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
5745
5746 #ifdef CONFIG_QUOTA
5747         if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
5748             sb_has_quota_limits_enabled(sb, PRJQUOTA))
5749                 ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
5750 #endif
5751         return 0;
5752 }
5753
5754
5755 #ifdef CONFIG_QUOTA
5756
5757 /*
5758  * Helper functions so that transaction is started before we acquire dqio_sem
5759  * to keep correct lock ordering of transaction > dqio_sem
5760  */
5761 static inline struct inode *dquot_to_inode(struct dquot *dquot)
5762 {
5763         return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type];
5764 }
5765
5766 static int ext4_write_dquot(struct dquot *dquot)
5767 {
5768         int ret, err;
5769         handle_t *handle;
5770         struct inode *inode;
5771
5772         inode = dquot_to_inode(dquot);
5773         handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
5774                                     EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
5775         if (IS_ERR(handle))
5776                 return PTR_ERR(handle);
5777         ret = dquot_commit(dquot);
5778         err = ext4_journal_stop(handle);
5779         if (!ret)
5780                 ret = err;
5781         return ret;
5782 }
5783
5784 static int ext4_acquire_dquot(struct dquot *dquot)
5785 {
5786         int ret, err;
5787         handle_t *handle;
5788
5789         handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5790                                     EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
5791         if (IS_ERR(handle))
5792                 return PTR_ERR(handle);
5793         ret = dquot_acquire(dquot);
5794         err = ext4_journal_stop(handle);
5795         if (!ret)
5796                 ret = err;
5797         return ret;
5798 }
5799
5800 static int ext4_release_dquot(struct dquot *dquot)
5801 {
5802         int ret, err;
5803         handle_t *handle;
5804
5805         handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
5806                                     EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
5807         if (IS_ERR(handle)) {
5808                 /* Release dquot anyway to avoid endless cycle in dqput() */
5809                 dquot_release(dquot);
5810                 return PTR_ERR(handle);
5811         }
5812         ret = dquot_release(dquot);
5813         err = ext4_journal_stop(handle);
5814         if (!ret)
5815                 ret = err;
5816         return ret;
5817 }
5818
5819 static int ext4_mark_dquot_dirty(struct dquot *dquot)
5820 {
5821         struct super_block *sb = dquot->dq_sb;
5822         struct ext4_sb_info *sbi = EXT4_SB(sb);
5823
5824         /* Are we journaling quotas? */
5825         if (ext4_has_feature_quota(sb) ||
5826             sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
5827                 dquot_mark_dquot_dirty(dquot);
5828                 return ext4_write_dquot(dquot);
5829         } else {
5830                 return dquot_mark_dquot_dirty(dquot);
5831         }
5832 }
5833
5834 static int ext4_write_info(struct super_block *sb, int type)
5835 {
5836         int ret, err;
5837         handle_t *handle;
5838
5839         /* Data block + inode block */
5840         handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
5841         if (IS_ERR(handle))
5842                 return PTR_ERR(handle);
5843         ret = dquot_commit_info(sb, type);
5844         err = ext4_journal_stop(handle);
5845         if (!ret)
5846                 ret = err;
5847         return ret;
5848 }
5849
5850 /*
5851  * Turn on quotas during mount time - we need to find
5852  * the quota file and such...
5853  */
5854 static int ext4_quota_on_mount(struct super_block *sb, int type)
5855 {
5856         return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type),
5857                                         EXT4_SB(sb)->s_jquota_fmt, type);
5858 }
5859
5860 static void lockdep_set_quota_inode(struct inode *inode, int subclass)
5861 {
5862         struct ext4_inode_info *ei = EXT4_I(inode);
5863
5864         /* The first argument of lockdep_set_subclass has to be
5865          * *exactly* the same as the argument to init_rwsem() --- in
5866          * this case, in init_once() --- or lockdep gets unhappy
5867          * because the name of the lock is set using the
5868          * stringification of the argument to init_rwsem().
5869          */
5870         (void) ei;      /* shut up clang warning if !CONFIG_LOCKDEP */
5871         lockdep_set_subclass(&ei->i_data_sem, subclass);
5872 }
5873
5874 /*
5875  * Standard function to be called on quota_on
5876  */
5877 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
5878                          const struct path *path)
5879 {
5880         int err;
5881
5882         if (!test_opt(sb, QUOTA))
5883                 return -EINVAL;
5884
5885         /* Quotafile not on the same filesystem? */
5886         if (path->dentry->d_sb != sb)
5887                 return -EXDEV;
5888         /* Journaling quota? */
5889         if (EXT4_SB(sb)->s_qf_names[type]) {
5890                 /* Quotafile not in fs root? */
5891                 if (path->dentry->d_parent != sb->s_root)
5892                         ext4_msg(sb, KERN_WARNING,
5893                                 "Quota file not on filesystem root. "
5894                                 "Journaled quota will not work");
5895                 sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY;
5896         } else {
5897                 /*
5898                  * Clear the flag just in case mount options changed since
5899                  * last time.
5900                  */
5901                 sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY;
5902         }
5903
5904         /*
5905          * When we journal data on quota file, we have to flush journal to see
5906          * all updates to the file when we bypass pagecache...
5907          */
5908         if (EXT4_SB(sb)->s_journal &&
5909             ext4_should_journal_data(d_inode(path->dentry))) {
5910                 /*
5911                  * We don't need to lock updates but journal_flush() could
5912                  * otherwise be livelocked...
5913                  */
5914                 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
5915                 err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
5916                 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
5917                 if (err)
5918                         return err;
5919         }
5920
5921         lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
5922         err = dquot_quota_on(sb, type, format_id, path);
5923         if (err) {
5924                 lockdep_set_quota_inode(path->dentry->d_inode,
5925                                              I_DATA_SEM_NORMAL);
5926         } else {
5927                 struct inode *inode = d_inode(path->dentry);
5928                 handle_t *handle;
5929
5930                 /*
5931                  * Set inode flags to prevent userspace from messing with quota
5932                  * files. If this fails, we return success anyway since quotas
5933                  * are already enabled and this is not a hard failure.
5934                  */
5935                 inode_lock(inode);
5936                 handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
5937                 if (IS_ERR(handle))
5938                         goto unlock_inode;
5939                 EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
5940                 inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
5941                                 S_NOATIME | S_IMMUTABLE);
5942                 err = ext4_mark_inode_dirty(handle, inode);
5943                 ext4_journal_stop(handle);
5944         unlock_inode:
5945                 inode_unlock(inode);
5946         }
5947         return err;
5948 }
5949
5950 static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
5951                              unsigned int flags)
5952 {
5953         int err;
5954         struct inode *qf_inode;
5955         unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5956                 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5957                 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
5958                 le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
5959         };
5960
5961         BUG_ON(!ext4_has_feature_quota(sb));
5962
5963         if (!qf_inums[type])
5964                 return -EPERM;
5965
5966         qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
5967         if (IS_ERR(qf_inode)) {
5968                 ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
5969                 return PTR_ERR(qf_inode);
5970         }
5971
5972         /* Don't account quota for quota files to avoid recursion */
5973         qf_inode->i_flags |= S_NOQUOTA;
5974         lockdep_set_quota_inode(qf_inode, I_DATA_SEM_QUOTA);
5975         err = dquot_load_quota_inode(qf_inode, type, format_id, flags);
5976         if (err)
5977                 lockdep_set_quota_inode(qf_inode, I_DATA_SEM_NORMAL);
5978         iput(qf_inode);
5979
5980         return err;
5981 }
5982
5983 /* Enable usage tracking for all quota types. */
5984 static int ext4_enable_quotas(struct super_block *sb)
5985 {
5986         int type, err = 0;
5987         unsigned long qf_inums[EXT4_MAXQUOTAS] = {
5988                 le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
5989                 le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
5990                 le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
5991         };
5992         bool quota_mopt[EXT4_MAXQUOTAS] = {
5993                 test_opt(sb, USRQUOTA),
5994                 test_opt(sb, GRPQUOTA),
5995                 test_opt(sb, PRJQUOTA),
5996         };
5997
5998         sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
5999         for (type = 0; type < EXT4_MAXQUOTAS; type++) {
6000                 if (qf_inums[type]) {
6001                         err = ext4_quota_enable(sb, type, QFMT_VFS_V1,
6002                                 DQUOT_USAGE_ENABLED |
6003                                 (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
6004                         if (err) {
6005                                 ext4_warning(sb,
6006                                         "Failed to enable quota tracking "
6007                                         "(type=%d, err=%d). Please run "
6008                                         "e2fsck to fix.", type, err);
6009                                 for (type--; type >= 0; type--)
6010                                         dquot_quota_off(sb, type);
6011
6012                                 return err;
6013                         }
6014                 }
6015         }
6016         return 0;
6017 }
6018
6019 static int ext4_quota_off(struct super_block *sb, int type)
6020 {
6021         struct inode *inode = sb_dqopt(sb)->files[type];
6022         handle_t *handle;
6023         int err;
6024
6025         /* Force all delayed allocation blocks to be allocated.
6026          * Caller already holds s_umount sem */
6027         if (test_opt(sb, DELALLOC))
6028                 sync_filesystem(sb);
6029
6030         if (!inode || !igrab(inode))
6031                 goto out;
6032
6033         err = dquot_quota_off(sb, type);
6034         if (err || ext4_has_feature_quota(sb))
6035                 goto out_put;
6036
6037         inode_lock(inode);
6038         /*
6039          * Update modification times of quota files when userspace can
6040          * start looking at them. If we fail, we return success anyway since
6041          * this is not a hard failure and quotas are already disabled.
6042          */
6043         handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
6044         if (IS_ERR(handle)) {
6045                 err = PTR_ERR(handle);
6046                 goto out_unlock;
6047         }
6048         EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
6049         inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
6050         inode->i_mtime = inode->i_ctime = current_time(inode);
6051         err = ext4_mark_inode_dirty(handle, inode);
6052         ext4_journal_stop(handle);
6053 out_unlock:
6054         inode_unlock(inode);
6055 out_put:
6056         lockdep_set_quota_inode(inode, I_DATA_SEM_NORMAL);
6057         iput(inode);
6058         return err;
6059 out:
6060         return dquot_quota_off(sb, type);
6061 }
6062
6063 /* Read data from quotafile - avoid pagecache and such because we cannot afford
6064  * acquiring the locks... As quota files are never truncated and quota code
6065  * itself serializes the operations (and no one else should touch the files)
6066  * we don't have to be afraid of races */
6067 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
6068                                size_t len, loff_t off)
6069 {
6070         struct inode *inode = sb_dqopt(sb)->files[type];
6071         ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
6072         int offset = off & (sb->s_blocksize - 1);
6073         int tocopy;
6074         size_t toread;
6075         struct buffer_head *bh;
6076         loff_t i_size = i_size_read(inode);
6077
6078         if (off > i_size)
6079                 return 0;
6080         if (off+len > i_size)
6081                 len = i_size-off;
6082         toread = len;
6083         while (toread > 0) {
6084                 tocopy = sb->s_blocksize - offset < toread ?
6085                                 sb->s_blocksize - offset : toread;
6086                 bh = ext4_bread(NULL, inode, blk, 0);
6087                 if (IS_ERR(bh))
6088                         return PTR_ERR(bh);
6089                 if (!bh)        /* A hole? */
6090                         memset(data, 0, tocopy);
6091                 else
6092                         memcpy(data, bh->b_data+offset, tocopy);
6093                 brelse(bh);
6094                 offset = 0;
6095                 toread -= tocopy;
6096                 data += tocopy;
6097                 blk++;
6098         }
6099         return len;
6100 }
6101
6102 /* Write to quotafile (we know the transaction is already started and has
6103  * enough credits) */
6104 static ssize_t ext4_quota_write(struct super_block *sb, int type,
6105                                 const char *data, size_t len, loff_t off)
6106 {
6107         struct inode *inode = sb_dqopt(sb)->files[type];
6108         ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
6109         int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1);
6110         int retries = 0;
6111         struct buffer_head *bh;
6112         handle_t *handle = journal_current_handle();
6113
6114         if (EXT4_SB(sb)->s_journal && !handle) {
6115                 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
6116                         " cancelled because transaction is not started",
6117                         (unsigned long long)off, (unsigned long long)len);
6118                 return -EIO;
6119         }
6120         /*
6121          * Since we account only one data block in transaction credits,
6122          * then it is impossible to cross a block boundary.
6123          */
6124         if (sb->s_blocksize - offset < len) {
6125                 ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
6126                         " cancelled because not block aligned",
6127                         (unsigned long long)off, (unsigned long long)len);
6128                 return -EIO;
6129         }
6130
6131         do {
6132                 bh = ext4_bread(handle, inode, blk,
6133                                 EXT4_GET_BLOCKS_CREATE |
6134                                 EXT4_GET_BLOCKS_METADATA_NOFAIL);
6135         } while (PTR_ERR(bh) == -ENOSPC &&
6136                  ext4_should_retry_alloc(inode->i_sb, &retries));
6137         if (IS_ERR(bh))
6138                 return PTR_ERR(bh);
6139         if (!bh)
6140                 goto out;
6141         BUFFER_TRACE(bh, "get write access");
6142         err = ext4_journal_get_write_access(handle, bh);
6143         if (err) {
6144                 brelse(bh);
6145                 return err;
6146         }
6147         lock_buffer(bh);
6148         memcpy(bh->b_data+offset, data, len);
6149         flush_dcache_page(bh->b_page);
6150         unlock_buffer(bh);
6151         err = ext4_handle_dirty_metadata(handle, NULL, bh);
6152         brelse(bh);
6153 out:
6154         if (inode->i_size < off + len) {
6155                 i_size_write(inode, off + len);
6156                 EXT4_I(inode)->i_disksize = inode->i_size;
6157                 err2 = ext4_mark_inode_dirty(handle, inode);
6158                 if (unlikely(err2 && !err))
6159                         err = err2;
6160         }
6161         return err ? err : len;
6162 }
6163 #endif
6164
6165 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
6166                        const char *dev_name, void *data)
6167 {
6168         return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
6169 }
6170
6171 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
6172 static inline void register_as_ext2(void)
6173 {
6174         int err = register_filesystem(&ext2_fs_type);
6175         if (err)
6176                 printk(KERN_WARNING
6177                        "EXT4-fs: Unable to register as ext2 (%d)\n", err);
6178 }
6179
6180 static inline void unregister_as_ext2(void)
6181 {
6182         unregister_filesystem(&ext2_fs_type);
6183 }
6184
/*
 * Can this filesystem be handled as ext2?
 *
 * Returns 0 when it has incompat features unknown to ext2.  Unknown
 * ro_compat features are tolerated only on a read-only superblock.
 * Returns 1 otherwise.
 */
static inline int ext2_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext2_incompat_features(sb))
		return 0;

	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext2_ro_compat_features(sb);
}
6195 #else
/* Stubs used when this driver does not stand in for ext2. */
static inline void register_as_ext2(void)
{
}

static inline void unregister_as_ext2(void)
{
}

static inline int ext2_feature_set_ok(struct super_block *sb)
{
	return 0;
}
6199 #endif
6200
6201 static inline void register_as_ext3(void)
6202 {
6203         int err = register_filesystem(&ext3_fs_type);
6204         if (err)
6205                 printk(KERN_WARNING
6206                        "EXT4-fs: Unable to register as ext3 (%d)\n", err);
6207 }
6208
6209 static inline void unregister_as_ext3(void)
6210 {
6211         unregister_filesystem(&ext3_fs_type);
6212 }
6213
/*
 * Can this filesystem be handled as ext3?
 *
 * Returns 0 when it has incompat features unknown to ext3 or lacks the
 * journal feature.  Unknown ro_compat features are tolerated only on a
 * read-only superblock.  Returns 1 otherwise.
 */
static inline int ext3_feature_set_ok(struct super_block *sb)
{
	if (ext4_has_unknown_ext3_incompat_features(sb) ||
	    !ext4_has_feature_journal(sb))
		return 0;

	return sb_rdonly(sb) ||
	       !ext4_has_unknown_ext3_ro_compat_features(sb);
}
6226
6227 static struct file_system_type ext4_fs_type = {
6228         .owner          = THIS_MODULE,
6229         .name           = "ext4",
6230         .mount          = ext4_mount,
6231         .kill_sb        = kill_block_super,
6232         .fs_flags       = FS_REQUIRES_DEV,
6233 };
6234 MODULE_ALIAS_FS("ext4");
6235
6236 /* Shared across all ext4 file systems */
6237 wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
6238
6239 static int __init ext4_init_fs(void)
6240 {
6241         int i, err;
6242
6243         ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
6244         ext4_li_info = NULL;
6245         mutex_init(&ext4_li_mtx);
6246
6247         /* Build-time check for flags consistency */
6248         ext4_check_flag_values();
6249
6250         for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
6251                 init_waitqueue_head(&ext4__ioend_wq[i]);
6252
6253         err = ext4_init_es();
6254         if (err)
6255                 return err;
6256
6257         err = ext4_init_pending();
6258         if (err)
6259                 goto out7;
6260
6261         err = ext4_init_post_read_processing();
6262         if (err)
6263                 goto out6;
6264
6265         err = ext4_init_pageio();
6266         if (err)
6267                 goto out5;
6268
6269         err = ext4_init_system_zone();
6270         if (err)
6271                 goto out4;
6272
6273         err = ext4_init_sysfs();
6274         if (err)
6275                 goto out3;
6276
6277         err = ext4_init_mballoc();
6278         if (err)
6279                 goto out2;
6280         err = init_inodecache();
6281         if (err)
6282                 goto out1;
6283         register_as_ext3();
6284         register_as_ext2();
6285         err = register_filesystem(&ext4_fs_type);
6286         if (err)
6287                 goto out;
6288
6289         return 0;
6290 out:
6291         unregister_as_ext2();
6292         unregister_as_ext3();
6293         destroy_inodecache();
6294 out1:
6295         ext4_exit_mballoc();
6296 out2:
6297         ext4_exit_sysfs();
6298 out3:
6299         ext4_exit_system_zone();
6300 out4:
6301         ext4_exit_pageio();
6302 out5:
6303         ext4_exit_post_read_processing();
6304 out6:
6305         ext4_exit_pending();
6306 out7:
6307         ext4_exit_es();
6308
6309         return err;
6310 }
6311
6312 static void __exit ext4_exit_fs(void)
6313 {
6314         ext4_destroy_lazyinit_thread();
6315         unregister_as_ext2();
6316         unregister_as_ext3();
6317         unregister_filesystem(&ext4_fs_type);
6318         destroy_inodecache();
6319         ext4_exit_mballoc();
6320         ext4_exit_sysfs();
6321         ext4_exit_system_zone();
6322         ext4_exit_pageio();
6323         ext4_exit_post_read_processing();
6324         ext4_exit_es();
6325         ext4_exit_pending();
6326 }
6327
6328 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
6329 MODULE_DESCRIPTION("Fourth Extended Filesystem");
6330 MODULE_LICENSE("GPL");
6331 MODULE_SOFTDEP("pre: crc32c");
6332 module_init(ext4_init_fs)
6333 module_exit(ext4_exit_fs)