Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux...
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Sep 2009 14:53:22 +0000 (07:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Sep 2009 14:53:22 +0000 (07:53 -0700)
* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (21 commits)
  HWPOISON: Enable error_remove_page on btrfs
  HWPOISON: Add simple debugfs interface to inject hwpoison on arbitrary PFNs
  HWPOISON: Add madvise() based injector for hardware poisoned pages v4
  HWPOISON: Enable error_remove_page for NFS
  HWPOISON: Enable .remove_error_page for migration aware file systems
  HWPOISON: The high level memory error handler in the VM v7
  HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per process
  HWPOISON: shmem: call set_page_dirty() with locked page
  HWPOISON: Define a new error_remove_page address space op for async truncation
  HWPOISON: Add invalidate_inode_page
  HWPOISON: Refactor truncate to allow direct truncating of page v2
  HWPOISON: check and isolate corrupted free pages v2
  HWPOISON: Handle hardware poisoned pages in try_to_unmap
  HWPOISON: Use bitmask/action code for try_to_unmap behaviour
  HWPOISON: x86: Add VM_FAULT_HWPOISON handling to x86 page fault handler v2
  HWPOISON: Add poison check to page fault handling
  HWPOISON: Add basic support for poisoned pages in fault handler v3
  HWPOISON: Add new SIGBUS error codes for hardware poison signals
  HWPOISON: Add support for poison swap entries v2
  HWPOISON: Export some rmap vma locking to outside world
  ...

30 files changed:
Documentation/sysctl/vm.txt
arch/x86/mm/fault.c
fs/btrfs/inode.c
fs/ext3/inode.c
fs/ext4/inode.c
fs/ocfs2/aops.c
fs/proc/meminfo.c
fs/xfs/linux-2.6/xfs_aops.c
include/asm-generic/mman-common.h
include/linux/fs.h
include/linux/mm.h
include/linux/page-flags.h
include/linux/prctl.h
include/linux/rmap.h
include/linux/sched.h
include/linux/swap.h
kernel/sys.c
kernel/sysctl.c
mm/Kconfig
mm/Makefile
mm/filemap.c
mm/madvise.c
mm/memory.c
mm/migrate.c
mm/page-writeback.c
mm/page_alloc.c
mm/rmap.c
mm/shmem.c
mm/swapfile.c
mm/vmscan.c

diff --combined Documentation/sysctl/vm.txt
index e6fb1ec2744b180d25bd41e6a672e5627dd8e971,faf62740aa2c2125e19f0c96f2032ac557a4fa11..a6e360d2055c561ae347a75e20c015b3f61a8928
@@@ -32,6 -32,8 +32,8 @@@ Currently, these files are in /proc/sys
  - legacy_va_layout
  - lowmem_reserve_ratio
  - max_map_count
+ - memory_failure_early_kill
+ - memory_failure_recovery
  - min_free_kbytes
  - min_slab_ratio
  - min_unmapped_ratio
@@@ -53,7 -55,6 +55,6 @@@
  - vfs_cache_pressure
  - zone_reclaim_mode
  
  ==============================================================
  
  block_dump
@@@ -275,6 -276,44 +276,44 @@@ e.g., up to one or two maps per allocat
  
  The default value is 65536.
  
+ =============================================================
+ memory_failure_early_kill:
+ Control how to kill processes when an uncorrected memory error (typically
+ a 2-bit error in a memory module) that cannot be handled by the kernel
+ is detected in the background by hardware. In some cases (like the page
+ still having a valid copy on disk) the kernel will handle the failure
+ transparently without affecting any applications. But if there is
+ no other up-to-date copy of the data it will kill processes to prevent
+ any data corruption from propagating.
+ 1: Kill all processes that have the corrupted, non-reloadable page mapped
+ as soon as the corruption is detected.  Note this is not supported
+ for a few types of pages, like kernel internally allocated data or
+ the swap cache, but works for the majority of user pages.
+ 0: Only unmap the corrupted page from all processes and kill only those
+ processes that try to access it.
+ The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+ handle this if they want to.
+ This is only active on architectures/platforms with advanced machine
+ check handling and depends on the hardware capabilities.
+ Applications can override this setting individually with the PR_MCE_KILL prctl.
+ ==============================================================
+ memory_failure_recovery:
+ Enable memory failure recovery (when supported by the platform).
+ 1: Attempt recovery.
+ 0: Always panic on a memory failure.
  ==============================================================
  
  min_free_kbytes:
@@@ -585,9 -624,7 +624,9 @@@ caching of directory and inode objects
  At the default value of vfs_cache_pressure=100 the kernel will attempt to
  reclaim dentries and inodes at a "fair" rate with respect to pagecache and
  swapcache reclaim.  Decreasing vfs_cache_pressure causes the kernel to prefer
 -to retain dentry and inode caches.  Increasing vfs_cache_pressure beyond 100
 +to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
 +never reclaim dentries and inodes due to memory pressure and this can easily
 +lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
  causes the kernel to prefer to reclaim dentries and inodes.
  
  ==============================================================
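
The memory_failure_early_kill documentation above and the x86 fault-path change below both deliver a catchable SIGBUS: BUS_MCEERR_AO for background ("action optional") kills and BUS_MCEERR_AR for a fault on a poisoned page, with si_addr/si_addr_lsb describing the affected region. A minimal userspace sketch of catching that signal follows; it assumes a libc whose headers already expose the BUS_MCEERR_* codes and the si_addr_lsb siginfo field (older headers may not).

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Sketch only, not kernel code: install a SIGBUS handler for the hwpoison
 * signals described in the vm.txt text above. */
static void hwpoison_handler(int sig, siginfo_t *si, void *ctx)
{
	unsigned long mask, start;

	(void)sig; (void)ctx;
	if (si->si_code != BUS_MCEERR_AO && si->si_code != BUS_MCEERR_AR)
		_exit(EXIT_FAILURE);		/* some other bus error */

	/* si_addr_lsb is the least significant valid bit of si_addr
	 * (PAGE_SHIFT in the x86 fault path below), i.e. it encodes the
	 * size of the poisoned region. */
	mask  = ~((1UL << si->si_addr_lsb) - 1);
	start = (unsigned long)si->si_addr & mask;

	/* Application-specific recovery for the poisoned range would go
	 * here.  fprintf() is not async-signal-safe; it only keeps the
	 * sketch short. */
	fprintf(stderr, "hwpoison SIGBUS: addr %#lx, %lu bytes, code %d\n",
		start, 1UL << si->si_addr_lsb, si->si_code);
	_exit(EXIT_FAILURE);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = hwpoison_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	pause();	/* a real program would do its normal work here */
	return 0;
}
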
diff --combined arch/x86/mm/fault.c
index 82728f2c6d5599ccda0c4cb1dee132ce3305f5ca,8ba5624082000eb5a101e3de8637ea1e4fcbabd9..f4cee9028cf0b01e11951662b625f63371f627e6
@@@ -10,7 -10,7 +10,7 @@@
  #include <linux/bootmem.h>            /* max_low_pfn                  */
  #include <linux/kprobes.h>            /* __kprobes, ...               */
  #include <linux/mmiotrace.h>          /* kmmio_handler, ...           */
 -#include <linux/perf_counter.h>               /* perf_swcounter_event         */
 +#include <linux/perf_event.h>         /* perf_sw_event                */
  
  #include <asm/traps.h>                        /* dotraplinkage, ...           */
  #include <asm/pgalloc.h>              /* pgd_*(), ...                 */
@@@ -167,6 -167,7 +167,7 @@@ force_sig_info_fault(int si_signo, int 
        info.si_errno   = 0;
        info.si_code    = si_code;
        info.si_addr    = (void __user *)address;
+       info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
  
        force_sig_info(si_signo, &info, tsk);
  }
@@@ -790,10 -791,12 +791,12 @@@ out_of_memory(struct pt_regs *regs, uns
  }
  
  static void
- do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+         unsigned int fault)
  {
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
+       int code = BUS_ADRERR;
  
        up_read(&mm->mmap_sem);
  
        tsk->thread.error_code  = error_code;
        tsk->thread.trap_no     = 14;
  
-       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+ #ifdef CONFIG_MEMORY_FAILURE
+       if (fault & VM_FAULT_HWPOISON) {
+               printk(KERN_ERR
+       "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+                       tsk->comm, tsk->pid, address);
+               code = BUS_MCEERR_AR;
+       }
+ #endif
+       force_sig_info_fault(SIGBUS, code, address, tsk);
  }
  
  static noinline void
@@@ -819,8 -830,8 +830,8 @@@ mm_fault_error(struct pt_regs *regs, un
        if (fault & VM_FAULT_OOM) {
                out_of_memory(regs, error_code, address);
        } else {
-               if (fault & VM_FAULT_SIGBUS)
-                       do_sigbus(regs, error_code, address);
+               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+                       do_sigbus(regs, error_code, address, fault);
                else
                        BUG();
        }
@@@ -1017,7 -1028,7 +1028,7 @@@ do_page_fault(struct pt_regs *regs, uns
        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(regs, error_code, address);
  
 -      perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 +      perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
  
        /*
         * If we're in an interrupt, have no user context or are running
@@@ -1114,11 -1125,11 +1125,11 @@@ good_area
  
        if (fault & VM_FAULT_MAJOR) {
                tsk->maj_flt++;
 -              perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 +              perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                                     regs, address);
        } else {
                tsk->min_flt++;
 -              perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 +              perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                                     regs, address);
        }
  
diff --combined fs/btrfs/inode.c
index 9096fd0ca3ca447a7195f489f03d862340173494,dd86050190fcb28e08d59ffa2b812db14c5764be..d154a3f365d53bf159f6e22aaa1714bccb4e7c56
@@@ -55,13 -55,13 +55,13 @@@ struct btrfs_iget_args 
        struct btrfs_root *root;
  };
  
 -static struct inode_operations btrfs_dir_inode_operations;
 -static struct inode_operations btrfs_symlink_inode_operations;
 -static struct inode_operations btrfs_dir_ro_inode_operations;
 -static struct inode_operations btrfs_special_inode_operations;
 -static struct inode_operations btrfs_file_inode_operations;
 -static struct address_space_operations btrfs_aops;
 -static struct address_space_operations btrfs_symlink_aops;
 +static const struct inode_operations btrfs_dir_inode_operations;
 +static const struct inode_operations btrfs_symlink_inode_operations;
 +static const struct inode_operations btrfs_dir_ro_inode_operations;
 +static const struct inode_operations btrfs_special_inode_operations;
 +static const struct inode_operations btrfs_file_inode_operations;
 +static const struct address_space_operations btrfs_aops;
 +static const struct address_space_operations btrfs_symlink_aops;
  static struct file_operations btrfs_dir_file_operations;
  static struct extent_io_ops btrfs_extent_io_ops;
  
@@@ -5201,7 -5201,7 +5201,7 @@@ static int btrfs_permission(struct inod
        return generic_permission(inode, mask, btrfs_check_acl);
  }
  
 -static struct inode_operations btrfs_dir_inode_operations = {
 +static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
        .create         = btrfs_create,
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
  };
 -static struct inode_operations btrfs_dir_ro_inode_operations = {
 +static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
        .permission     = btrfs_permission,
  };
@@@ -5259,7 -5259,7 +5259,7 @@@ static struct extent_io_ops btrfs_exten
   *
   * For now we're avoiding this by dropping bmap.
   */
 -static struct address_space_operations btrfs_aops = {
 +static const struct address_space_operations btrfs_aops = {
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
        .set_page_dirty = btrfs_set_page_dirty,
+       .error_remove_page = generic_error_remove_page,
  };
  
 -static struct address_space_operations btrfs_symlink_aops = {
 +static const struct address_space_operations btrfs_symlink_aops = {
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
  };
  
 -static struct inode_operations btrfs_file_inode_operations = {
 +static const struct inode_operations btrfs_file_inode_operations = {
        .truncate       = btrfs_truncate,
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .fallocate      = btrfs_fallocate,
        .fiemap         = btrfs_fiemap,
  };
 -static struct inode_operations btrfs_special_inode_operations = {
 +static const struct inode_operations btrfs_special_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
  };
 -static struct inode_operations btrfs_symlink_inode_operations = {
 +static const struct inode_operations btrfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
diff --combined fs/ext3/inode.c
index cd098a7b77fc04b7255fe5586248faa67dbbfb07,953b430f92e37f8bfb5d109ef42c7136cddaac56..acf1b14233275e891fd5e1d55560fed331add18c
@@@ -172,21 -172,10 +172,21 @@@ static int try_to_extend_transaction(ha
   * so before we call here everything must be consistently dirtied against
   * this transaction.
   */
 -static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
 +static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
  {
 +      int ret;
 +
        jbd_debug(2, "restarting handle %p\n", handle);
 -      return ext3_journal_restart(handle, blocks_for_truncate(inode));
 +      /*
 +       * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
 +       * At this moment, get_block can be called only for blocks inside
 +       * i_size since page cache has been already dropped and writes are
 +       * blocked by i_mutex. So we can safely drop the truncate_mutex.
 +       */
 +      mutex_unlock(&EXT3_I(inode)->truncate_mutex);
 +      ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
 +      mutex_lock(&EXT3_I(inode)->truncate_mutex);
 +      return ret;
  }
  
  /*
@@@ -1830,6 -1819,7 +1830,7 @@@ static const struct address_space_opera
        .direct_IO              = ext3_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext3_writeback_aops = {
        .direct_IO              = ext3_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext3_journalled_aops = {
        .invalidatepage         = ext3_invalidatepage,
        .releasepage            = ext3_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  void ext3_set_aops(struct inode *inode)
@@@ -2083,7 -2075,7 +2086,7 @@@ static void ext3_clear_blocks(handle_t 
                        ext3_journal_dirty_metadata(handle, bh);
                }
                ext3_mark_inode_dirty(handle, inode);
 -              ext3_journal_test_restart(handle, inode);
 +              truncate_restart_transaction(handle, inode);
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext3_journal_get_write_access(handle, bh);
@@@ -2293,7 -2285,7 +2296,7 @@@ static void ext3_free_branches(handle_
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext3_mark_inode_dirty(handle, inode);
 -                              ext3_journal_test_restart(handle, inode);
 +                              truncate_restart_transaction(handle, inode);
                        }
  
                        ext3_free_blocks(handle, inode, nr, 1);
@@@ -2903,10 -2895,6 +2906,10 @@@ static int ext3_do_update_inode(handle_
        struct buffer_head *bh = iloc->bh;
        int err = 0, rc, block;
  
 +again:
 +      /* we can't allow multiple procs in here at once, it's a bit racy */
 +      lock_buffer(bh);
 +
        /* For fields not tracked in the in-memory inode,
         * initialise them to zero for new inodes. */
        if (ei->i_state & EXT3_STATE_NEW)
                               /* If this is the first large file
                                * created, add a flag to the superblock.
                                */
 +                              unlock_buffer(bh);
                                err = ext3_journal_get_write_access(handle,
                                                EXT3_SB(sb)->s_sbh);
                                if (err)
                                        goto out_brelse;
 +
                                ext3_update_dynamic_rev(sb);
                                EXT3_SET_RO_COMPAT_FEATURE(sb,
                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
                                handle->h_sync = 1;
                                err = ext3_journal_dirty_metadata(handle,
                                                EXT3_SB(sb)->s_sbh);
 +                              /* get our lock and start over */
 +                              goto again;
                        }
                }
        }
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
  
        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 +      unlock_buffer(bh);
        rc = ext3_journal_dirty_metadata(handle, bh);
        if (!err)
                err = rc;
diff --combined fs/ext4/inode.c
index 3a798737e305756a493e6ad13f865b302f9174a8,349dd6b4da47492ed2e8e94f613de6e1e86dd676..064746fad5812e693ef6d3ef2578822a3007cadb
@@@ -192,24 -192,11 +192,24 @@@ static int try_to_extend_transaction(ha
   * so before we call here everything must be consistently dirtied against
   * this transaction.
   */
 -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 + int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 +                               int nblocks)
  {
 +      int ret;
 +
 +      /*
 +       * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
 +       * moment, get_block can be called only for blocks inside i_size since
 +       * page cache has been already dropped and writes are blocked by
 +       * i_mutex. So we can safely drop the i_data_sem here.
 +       */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
        jbd_debug(2, "restarting handle %p\n", handle);
 -      return ext4_journal_restart(handle, blocks_for_truncate(inode));
 +      up_write(&EXT4_I(inode)->i_data_sem);
 +      ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
 +      down_write(&EXT4_I(inode)->i_data_sem);
 +
 +      return ret;
  }
  
  /*
@@@ -354,7 -341,9 +354,7 @@@ static int ext4_block_to_path(struct in
        int n = 0;
        int final = 0;
  
 -      if (i_block < 0) {
 -              ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
 -      } else if (i_block < direct_blocks) {
 +      if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
@@@ -562,21 -551,15 +562,21 @@@ static ext4_fsblk_t ext4_find_near(stru
   *
   *    Normally this function find the preferred place for block allocation,
   *    returns it.
 + *    Because this is only used for non-extent files, we limit the block nr
 + *    to 32 bits.
   */
  static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                                   Indirect *partial)
  {
 +      ext4_fsblk_t goal;
 +
        /*
         * XXX need to get goal block from mballoc's data structures
         */
  
 -      return ext4_find_near(inode, partial);
 +      goal = ext4_find_near(inode, partial);
 +      goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 +      return goal;
  }
  
  /**
@@@ -657,8 -640,6 +657,8 @@@ static int ext4_alloc_blocks(handle_t *
                if (*err)
                        goto failed_out;
  
 +              BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
 +
                target -= count;
                /* allocate blocks for indirect blocks */
                while (index < indirect_blks && count) {
                ar.flags = EXT4_MB_HINT_DATA;
  
        current_block = ext4_mb_new_blocks(handle, &ar, err);
 +      BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
  
        if (*err && (target == blks)) {
                /*
@@@ -782,9 -762,8 +782,9 @@@ static int ext4_alloc_branch(handle_t *
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, bh);
                if (err) {
 +                      /* Don't brelse(bh) here; it's done in
 +                       * ext4_journal_forget() below */
                        unlock_buffer(bh);
 -                      brelse(bh);
                        goto failed;
                }
  
@@@ -1130,15 -1109,16 +1130,15 @@@ static void ext4_da_update_reserve_spac
                ext4_discard_preallocations(inode);
  }
  
 -static int check_block_validity(struct inode *inode, sector_t logical,
 -                              sector_t phys, int len)
 +static int check_block_validity(struct inode *inode, const char *msg,
 +                              sector_t logical, sector_t phys, int len)
  {
        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
 -              ext4_error(inode->i_sb, "check_block_validity",
 +              ext4_error(inode->i_sb, msg,
                           "inode #%lu logical block %llu mapped to %llu "
                           "(size %d)", inode->i_ino,
                           (unsigned long long) logical,
                           (unsigned long long) phys, len);
 -              WARN_ON(1);
                return -EIO;
        }
        return 0;
@@@ -1190,8 -1170,8 +1190,8 @@@ int ext4_get_blocks(handle_t *handle, s
        up_read((&EXT4_I(inode)->i_data_sem));
  
        if (retval > 0 && buffer_mapped(bh)) {
 -              int ret = check_block_validity(inode, block,
 -                                             bh->b_blocknr, retval);
 +              int ret = check_block_validity(inode, "file system corruption",
 +                                             block, bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
 -                      EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
 -                                                      ~EXT4_EXT_MIGRATE;
 +                      EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
                }
        }
  
  
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && buffer_mapped(bh)) {
 -              int ret = check_block_validity(inode, block,
 -                                             bh->b_blocknr, retval);
 +              int ret = check_block_validity(inode, "file system "
 +                                             "corruption after allocation",
 +                                             block, bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
@@@ -1883,6 -1863,18 +1883,6 @@@ static void ext4_da_page_release_reserv
   * Delayed allocation stuff
   */
  
 -struct mpage_da_data {
 -      struct inode *inode;
 -      sector_t b_blocknr;             /* start block number of extent */
 -      size_t b_size;                  /* size of extent */
 -      unsigned long b_state;          /* state of the extent */
 -      unsigned long first_page, next_page;    /* extent of pages */
 -      struct writeback_control *wbc;
 -      int io_done;
 -      int pages_written;
 -      int retval;
 -};
 -
  /*
   * mpage_da_submit_io - walks through extent of pages and try to write
   * them with writepage() call back
@@@ -2337,7 -2329,7 +2337,7 @@@ static int __mpage_da_writepage(struct 
                /*
                 * Rest of the page in the page_vec
                 * redirty them and skip them. We will
 -               * try to to write them again after
 +               * try to write them again after
                 * starting a new transaction
                 */
                redirty_page_for_writepage(wbc, page);
@@@ -2745,7 -2737,6 +2745,7 @@@ static int ext4_da_writepages(struct ad
        long pages_skipped;
        int range_cyclic, cycled = 1, io_done = 0;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
 +      loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
  
        trace_ext4_da_writepages(inode, wbc);
@@@ -2859,7 -2850,6 +2859,7 @@@ retry
                        mpd.io_done = 1;
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
 +              trace_ext4_da_write_pages(inode, &mpd);
                wbc->nr_to_write -= mpd.pages_written;
  
                ext4_journal_stop(handle);
@@@ -2915,7 -2905,6 +2915,7 @@@ out_writepages
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
        wbc->nr_to_write -= nr_to_writebump;
 +      wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
        return ret;
  }
@@@ -3128,8 -3117,6 +3128,8 @@@ out
   */
  int ext4_alloc_da_blocks(struct inode *inode)
  {
 +      trace_ext4_alloc_da_blocks(inode);
 +
        if (!EXT4_I(inode)->i_reserved_data_blocks &&
            !EXT4_I(inode)->i_reserved_meta_blocks)
                return 0;
@@@ -3386,6 -3373,7 +3386,7 @@@ static const struct address_space_opera
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext4_writeback_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext4_journalled_aops = {
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext4_da_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  void ext4_set_aops(struct inode *inode)
@@@ -3672,8 -3663,7 +3676,8 @@@ static void ext4_clear_blocks(handle_t 
                        ext4_handle_dirty_metadata(handle, inode, bh);
                }
                ext4_mark_inode_dirty(handle, inode);
 -              ext4_journal_test_restart(handle, inode);
 +              ext4_truncate_restart_trans(handle, inode,
 +                                          blocks_for_truncate(inode));
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext4_journal_get_write_access(handle, bh);
@@@ -3884,8 -3874,7 +3888,8 @@@ static void ext4_free_branches(handle_
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext4_mark_inode_dirty(handle, inode);
 -                              ext4_journal_test_restart(handle, inode);
 +                              ext4_truncate_restart_trans(handle, inode,
 +                                          blocks_for_truncate(inode));
                        }
  
                        ext4_free_blocks(handle, inode, nr, 1, 1);
@@@ -3973,7 -3962,8 +3977,7 @@@ void ext4_truncate(struct inode *inode
        if (!ext4_can_truncate(inode))
                return;
  
 -      if (ei->i_disksize && inode->i_size == 0 &&
 -          !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 +      if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
  
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@@ -4547,8 -4537,7 +4551,8 @@@ static int ext4_inode_blocks_set(handle
   */
  static int ext4_do_update_inode(handle_t *handle,
                                struct inode *inode,
 -                              struct ext4_iloc *iloc)
 +                              struct ext4_iloc *iloc,
 +                              int do_sync)
  {
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct ext4_inode_info *ei = EXT4_I(inode);
        if (ext4_inode_blocks_set(handle, raw_inode, ei))
                goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 -      /* clear the migrate flag in the raw_inode */
 -      raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
 +      raw_inode->i_flags = cpu_to_le32(ei->i_flags);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_HURD))
                raw_inode->i_file_acl_high =
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
        }
  
 -      BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 -      rc = ext4_handle_dirty_metadata(handle, inode, bh);
 -      if (!err)
 -              err = rc;
 +      /*
 +       * If we're not using a journal and we were called from
 +       * ext4_write_inode() to sync the inode (making do_sync true),
 +       * we can just use sync_dirty_buffer() directly to do our dirty
 +       * work.  Testing s_journal here is a bit redundant but it's
 +       * worth it to avoid potential future trouble.
 +       */
 +      if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
 +              BUFFER_TRACE(bh, "call sync_dirty_buffer");
 +              sync_dirty_buffer(bh);
 +      } else {
 +              BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 +              rc = ext4_handle_dirty_metadata(handle, inode, bh);
 +              if (!err)
 +                      err = rc;
 +      }
        ei->i_state &= ~EXT4_STATE_NEW;
  
  out_brelse:
   */
  int ext4_write_inode(struct inode *inode, int wait)
  {
 +      int err;
 +
        if (current->flags & PF_MEMALLOC)
                return 0;
  
 -      if (ext4_journal_current_handle()) {
 -              jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
 -              dump_stack();
 -              return -EIO;
 -      }
 +      if (EXT4_SB(inode->i_sb)->s_journal) {
 +              if (ext4_journal_current_handle()) {
 +                      jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
 +                      dump_stack();
 +                      return -EIO;
 +              }
  
 -      if (!wait)
 -              return 0;
 +              if (!wait)
 +                      return 0;
 +
 +              err = ext4_force_commit(inode->i_sb);
 +      } else {
 +              struct ext4_iloc iloc;
  
 -      return ext4_force_commit(inode->i_sb);
 +              err = ext4_get_inode_loc(inode, &iloc);
 +              if (err)
 +                      return err;
 +              err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
 +                                         inode, &iloc, wait);
 +      }
 +      return err;
  }
  
  /*
@@@ -5029,7 -4994,7 +5033,7 @@@ int ext4_mark_iloc_dirty(handle_t *hand
        get_bh(iloc->bh);
  
        /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
 -      err = ext4_do_update_inode(handle, inode, iloc);
 +      err = ext4_do_update_inode(handle, inode, iloc, 0);
        put_bh(iloc->bh);
        return err;
  }
@@@ -5320,21 -5285,12 +5324,21 @@@ int ext4_page_mkwrite(struct vm_area_st
        else
                len = PAGE_CACHE_SIZE;
  
 +      lock_page(page);
 +      /*
 +       * return if we have all the buffers mapped. This avoids
 +       * the need to call write_begin/write_end which does a
 +       * journal_start/journal_stop which can block and take a
 +       * long time
 +       */
        if (page_has_buffers(page)) {
 -              /* return if we have all the buffers mapped */
                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 -                                     ext4_bh_unmapped))
 +                                      ext4_bh_unmapped)) {
 +                      unlock_page(page);
                        goto out_unlock;
 +              }
        }
 +      unlock_page(page);
        /*
         * OK, we need to fill the hole... Do write_begin write_end
         * to do block allocation/reservation. We are not holding
diff --combined fs/ocfs2/aops.c
index 72e76062a900d2555fac2dae44ef325a5d8aa897,747f15eefd82e95dbad0c1afdbad690f1c750c0c..deb2b132ae5ed42b68fd11f58413f2ffa4779b83
@@@ -44,7 -44,6 +44,7 @@@
  #include "suballoc.h"
  #include "super.h"
  #include "symlink.h"
 +#include "refcounttree.h"
  
  #include "buffer_head_io.h"
  
@@@ -127,8 -126,8 +127,8 @@@ bail
        return err;
  }
  
 -static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 -                         struct buffer_head *bh_result, int create)
 +int ocfs2_get_block(struct inode *inode, sector_t iblock,
 +                  struct buffer_head *bh_result, int create)
  {
        int err = 0;
        unsigned int ext_flags;
@@@ -591,8 -590,6 +591,8 @@@ static int ocfs2_direct_IO_get_blocks(s
                goto bail;
        }
  
 +      /* We should already CoW the refcounted extent. */
 +      BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
        /*
         * get_more_blocks() expects us to describe a hole by clearing
         * the mapped bit on bh_result().
@@@ -690,10 -687,6 +690,10 @@@ static ssize_t ocfs2_direct_IO(int rw
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;
  
 +      /* Fallback to buffered I/O if we are appending. */
 +      if (i_size_read(inode) <= offset)
 +              return 0;
 +
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                            inode->i_sb->s_bdev, iov, offset,
                                            nr_segs, 
@@@ -1266,8 -1259,7 +1266,8 @@@ static int ocfs2_write_cluster(struct a
                        goto out;
                }
        } else if (unwritten) {
 -              ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
 +              ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 +                                            wc->w_di_bh);
                ret = ocfs2_mark_extent_written(inode, &et,
                                                wc->w_handle, cpos, 1, phys,
                                                meta_ac, &wc->w_dealloc);
@@@ -1456,9 -1448,6 +1456,9 @@@ static int ocfs2_populate_write_desc(st
                                goto out;
                        }
  
 +                      /* We should already CoW the refcounted extent. */
 +                      BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
 +
                        /*
                         * Assume worst case - that we're writing in
                         * the middle of the extent.
@@@ -1539,7 -1528,7 +1539,7 @@@ static int ocfs2_write_begin_inline(str
                goto out;
        }
  
 -      ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
 +      ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                ocfs2_commit_trans(osb, handle);
@@@ -1710,19 -1699,6 +1710,19 @@@ int ocfs2_write_begin_nolock(struct add
                goto out;
        }
  
 +      ret = ocfs2_check_range_for_refcount(inode, pos, len);
 +      if (ret < 0) {
 +              mlog_errno(ret);
 +              goto out;
 +      } else if (ret == 1) {
 +              ret = ocfs2_refcount_cow(inode, di_bh,
 +                                       wc->w_cpos, wc->w_clen, UINT_MAX);
 +              if (ret) {
 +                      mlog_errno(ret);
 +                      goto out;
 +              }
 +      }
 +
        ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
                                        &extents_to_split);
        if (ret) {
                     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
                     clusters_to_alloc, extents_to_split);
  
 -              ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
 +              ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 +                                            wc->w_di_bh);
                ret = ocfs2_lock_allocators(inode, &et,
                                            clusters_to_alloc, extents_to_split,
                                            &data_ac, &meta_ac);
         * We don't want this to fail in ocfs2_write_end(), so do it
         * here.
         */
 -      ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
 +      ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
@@@ -2022,4 -1997,5 +2022,5 @@@ const struct address_space_operations o
        .releasepage            = ocfs2_releasepage,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
diff --combined fs/proc/meminfo.c
index 171e052c07b3684f4a4264ebb44e802affc20f6e,78faedcb0a8d5e9ac012f4fb612609855bf3995a..c7bff4f603ff1557f7663fa22b9fb7ae8ed8a2e8
@@@ -81,11 -81,9 +81,11 @@@ static int meminfo_proc_show(struct seq
                "Writeback:      %8lu kB\n"
                "AnonPages:      %8lu kB\n"
                "Mapped:         %8lu kB\n"
 +              "Shmem:          %8lu kB\n"
                "Slab:           %8lu kB\n"
                "SReclaimable:   %8lu kB\n"
                "SUnreclaim:     %8lu kB\n"
 +              "KernelStack:    %8lu kB\n"
                "PageTables:     %8lu kB\n"
  #ifdef CONFIG_QUICKLIST
                "Quicklists:     %8lu kB\n"
                "Committed_AS:   %8lu kB\n"
                "VmallocTotal:   %8lu kB\n"
                "VmallocUsed:    %8lu kB\n"
-               "VmallocChunk:   %8lu kB\n",
+               "VmallocChunk:   %8lu kB\n"
+ #ifdef CONFIG_MEMORY_FAILURE
+               "HardwareCorrupted: %8lu kB\n"
+ #endif
+               ,
                K(i.totalram),
                K(i.freeram),
                K(i.bufferram),
                K(global_page_state(NR_WRITEBACK)),
                K(global_page_state(NR_ANON_PAGES)),
                K(global_page_state(NR_FILE_MAPPED)),
 +              K(global_page_state(NR_SHMEM)),
                K(global_page_state(NR_SLAB_RECLAIMABLE) +
                                global_page_state(NR_SLAB_UNRECLAIMABLE)),
                K(global_page_state(NR_SLAB_RECLAIMABLE)),
                K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
 +              global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
                K(global_page_state(NR_PAGETABLE)),
  #ifdef CONFIG_QUICKLIST
                K(quicklist_total_size()),
                (unsigned long)VMALLOC_TOTAL >> 10,
                vmi.used >> 10,
                vmi.largest_chunk >> 10
+ #ifdef CONFIG_MEMORY_FAILURE
+               ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+ #endif
                );
  
        hugetlb_report_meminfo(m);
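
The meminfo hunk above adds a "HardwareCorrupted" line backed by the new mce_bad_pages counter. A small sketch of reading it from userspace, assuming only that the field is printed exactly as in the format string above (it is absent on kernels without CONFIG_MEMORY_FAILURE):

#include <stdio.h>

int main(void)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Matches the "HardwareCorrupted: %8lu kB" format added above. */
		if (sscanf(line, "HardwareCorrupted: %lu", &kb) == 1) {
			printf("hardware corrupted: %lu kB\n", kb);
			fclose(f);
			return 0;
		}
	}
	fclose(f);
	puts("no HardwareCorrupted field (CONFIG_MEMORY_FAILURE not set?)");
	return 0;
}
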
diff --combined fs/xfs/linux-2.6/xfs_aops.c
index d5e5559e31db3774ba6f76e350302009ccf831eb,52f3fc63571aaaa69765f3be5d2f8ed776477c5b..381854461b282fe928c93cf61efb41edb1ad1acf
@@@ -216,6 -216,7 +216,6 @@@ xfs_setfilesize
        if (ip->i_d.di_size < isize) {
                ip->i_d.di_size = isize;
                ip->i_update_core = 1;
 -              ip->i_update_size = 1;
                xfs_mark_inode_dirty_sync(ip);
        }
  
@@@ -1635,4 -1636,5 +1635,5 @@@ const struct address_space_operations x
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
diff --combined include/asm-generic/mman-common.h
index dd63bd38864b23627f36e57b53c8413af63cd238,c325d1ef42ab746bc9086e3c8347802a8a9c561a..5ee13b2fd223599422446b509227eadb24b7f7e8
  #define MADV_REMOVE   9               /* remove these pages & resources */
  #define MADV_DONTFORK 10              /* don't inherit across fork */
  #define MADV_DOFORK   11              /* do inherit across fork */
+ #define MADV_HWPOISON 100             /* poison a page for testing */
  
 +#define MADV_MERGEABLE   12           /* KSM may merge identical pages */
 +#define MADV_UNMERGEABLE 13           /* KSM may not merge identical pages */
 +
  /* compatibility flags */
  #define MAP_FILE      0
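
MADV_HWPOISON above backs the madvise()-based injector from the commit list; it exists for testing the recovery paths, not for normal use. A hedged sketch of exercising it, assuming a kernel built with the memory-failure/injection code and a sufficiently privileged caller (the injector is root-only), run only on a machine you can afford to lose:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* MADV_HWPOISON is 100 in the hunk above; older userspace headers may not
 * define it yet. */
#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* make sure the page is actually instantiated */

	/* Poison exactly one page; what happens on a later access (SIGBUS,
	 * early kill, ...) depends on the sysctls documented above. */
	if (madvise(p, psz, MADV_HWPOISON) != 0) {
		fprintf(stderr, "madvise(MADV_HWPOISON): %s\n", strerror(errno));
		return 1;
	}
	printf("page at %p poisoned for testing\n", (void *)p);
	return 0;
}
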
  
diff --combined include/linux/fs.h
index 33ed6644abd08a20dfc3e5ba1eef6408a6687add,4f47afd37647b330d0eb16bfcafdf237186c32b1..78e95b8b66d4071dfaf24e319c3aa1470ffd646f
@@@ -595,6 -595,7 +595,7 @@@ struct address_space_operations 
        int (*launder_page) (struct page *);
        int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                                        unsigned long);
+       int (*error_remove_page)(struct address_space *, struct page *);
  };
  
  /*
@@@ -655,6 -656,7 +656,6 @@@ struct block_device 
        int                     bd_invalidated;
        struct gendisk *        bd_disk;
        struct list_head        bd_list;
 -      struct backing_dev_info *bd_inode_backing_dev_info;
        /*
         * Private data.  You must have bd_claim'ed the block_device
         * to use this.  NOTE:  bd_claim allows an owner to claim
@@@ -1066,8 -1068,8 +1067,8 @@@ struct file_lock 
        struct fasync_struct *  fl_fasync; /* for lease break notifications */
        unsigned long fl_break_time;    /* for nonblocking lease breaks */
  
 -      struct file_lock_operations *fl_ops;    /* Callbacks for filesystems */
 -      struct lock_manager_operations *fl_lmops;       /* Callbacks for lockmanagers */
 +      const struct file_lock_operations *fl_ops;      /* Callbacks for filesystems */
 +      const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */
        union {
                struct nfs_lock_info    nfs_fl;
                struct nfs4_lock_info   nfs4_fl;
@@@ -1318,8 -1320,8 +1319,8 @@@ struct super_block 
        unsigned long long      s_maxbytes;     /* Max file size */
        struct file_system_type *s_type;
        const struct super_operations   *s_op;
 -      struct dquot_operations *dq_op;
 -      struct quotactl_ops     *s_qcop;
 +      const struct dquot_operations   *dq_op;
 +      const struct quotactl_ops       *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long           s_flags;
        unsigned long           s_magic;
        int                     s_nr_dentry_unused;     /* # of dentry on lru */
  
        struct block_device     *s_bdev;
 +      struct backing_dev_info *s_bdi;
        struct mtd_info         *s_mtd;
        struct list_head        s_instances;
        struct quota_info       s_dquot;        /* Diskquota specific options */
@@@ -2467,7 -2468,7 +2468,7 @@@ ssize_t simple_attr_write(struct file *
                          size_t len, loff_t *ppos);
  
  struct ctl_table;
 -int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
 +int proc_nr_files(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
  
  int __init get_filesystem_list(char *buf);
diff --combined include/linux/mm.h
index 87218ae84e36f227ac0203c5214bf85351fae6f5,1ffca03f34b799bbdce8f7ee82d8f875fb9e60ae..6953a5a53e4495eeee028ee6c3b8f3bef00839b7
@@@ -25,7 -25,6 +25,7 @@@ extern unsigned long max_mapnr
  #endif
  
  extern unsigned long num_physpages;
 +extern unsigned long totalram_pages;
  extern void * high_memory;
  extern int page_cluster;
  
@@@ -104,7 -103,6 +104,7 @@@ extern unsigned int kobjsize(const voi
  #define VM_MIXEDMAP   0x10000000      /* Can contain "struct page" and pure PFN pages */
  #define VM_SAO                0x20000000      /* Strong Access Ordering (powerpc) */
  #define VM_PFN_AT_MMAP        0x40000000      /* PFNMAP vma that is fully mapped at mmap time */
 +#define VM_MERGEABLE  0x80000000      /* KSM may merge identical pages */
  
  #ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
  #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@@ -285,14 -283,6 +285,14 @@@ static inline int is_vmalloc_addr(cons
        return 0;
  #endif
  }
 +#ifdef CONFIG_MMU
 +extern int is_vmalloc_or_module_addr(const void *x);
 +#else
 +static inline int is_vmalloc_or_module_addr(const void *x)
 +{
 +      return 0;
 +}
 +#endif
  
  static inline struct page *compound_head(struct page *page)
  {
@@@ -695,11 -685,12 +695,12 @@@ static inline int page_mapped(struct pa
  #define VM_FAULT_SIGBUS       0x0002
  #define VM_FAULT_MAJOR        0x0004
  #define VM_FAULT_WRITE        0x0008  /* Special case for get_user_pages */
+ #define VM_FAULT_HWPOISON 0x0010      /* Hit poisoned page */
  
  #define VM_FAULT_NOPAGE       0x0100  /* ->fault installed the pte, not return page */
  #define VM_FAULT_LOCKED       0x0200  /* ->fault locked the returned page */
  
- #define VM_FAULT_ERROR        (VM_FAULT_OOM | VM_FAULT_SIGBUS)
+ #define VM_FAULT_ERROR        (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
  
  /*
   * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
@@@ -710,8 -701,17 +711,8 @@@ extern void pagefault_out_of_memory(voi
  
  extern void show_free_areas(void);
  
 -#ifdef CONFIG_SHMEM
 -extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
 -#else
 -static inline int shmem_lock(struct file *file, int lock,
 -                          struct user_struct *user)
 -{
 -      return 0;
 -}
 -#endif
 +int shmem_lock(struct file *file, int lock, struct user_struct *user);
  struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
 -
  int shmem_zero_setup(struct vm_area_struct *);
  
  #ifndef CONFIG_MMU
@@@ -794,6 -794,11 +795,11 @@@ static inline void unmap_shared_mapping
  extern int vmtruncate(struct inode * inode, loff_t offset);
  extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
  
+ int truncate_inode_page(struct address_space *mapping, struct page *page);
+ int generic_error_remove_page(struct address_space *mapping, struct page *page);
+ int invalidate_inode_page(struct page *page);
  #ifdef CONFIG_MMU
  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
@@@ -816,7 -821,6 +822,7 @@@ int get_user_pages(struct task_struct *
                        struct page **pages, struct vm_area_struct **vmas);
  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages);
 +struct page *get_dump_page(unsigned long addr);
  
  extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
  extern void do_invalidatepage(struct page *page, unsigned long offset);
@@@ -1060,8 -1064,6 +1066,8 @@@ extern void setup_per_cpu_pageset(void)
  static inline void setup_per_cpu_pageset(void) {}
  #endif
  
 +extern void zone_pcp_update(struct zone *zone);
 +
  /* nommu.c */
  extern atomic_long_t mmap_pages_allocated;
  
@@@ -1230,8 -1232,7 +1236,8 @@@ struct page *follow_page(struct vm_area
  #define FOLL_WRITE    0x01    /* check pte is writable */
  #define FOLL_TOUCH    0x02    /* mark page accessed */
  #define FOLL_GET      0x04    /* do get_page on page */
 -#define FOLL_ANON     0x08    /* give ZERO_PAGE if no pgtable */
 +#define FOLL_DUMP     0x08    /* give error on hole if it would be zero */
 +#define FOLL_FORCE    0x10    /* get_user_pages read/write w/o permission */
  
  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                        void *data);
@@@ -1279,7 -1280,7 +1285,7 @@@ int in_gate_area_no_task(unsigned long 
  #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
  #endif        /* __HAVE_ARCH_GATE_AREA */
  
 -int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 +int drop_caches_sysctl_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                        unsigned long lru_pages);
@@@ -1308,5 -1309,12 +1314,12 @@@ void vmemmap_populate_print_last(void)
  extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
                                 size_t size);
  extern void refund_locked_memory(struct mm_struct *mm, size_t size);
+ extern void memory_failure(unsigned long pfn, int trapno);
+ extern int __memory_failure(unsigned long pfn, int trapno, int ref);
+ extern int sysctl_memory_failure_early_kill;
+ extern int sysctl_memory_failure_recovery;
+ extern atomic_long_t mce_bad_pages;
  #endif /* __KERNEL__ */
  #endif /* _LINUX_MM_H */
diff --combined include/linux/page-flags.h
index 13de789f0a5c1b8b47660dd37ca0c7da73dd26fd,9bc5fd9fdbf6209e55ab87fc267fee12ae6a7b39..6b202b173955541adbe03ebd8137af6749f1c214
@@@ -51,6 -51,9 +51,9 @@@
   * PG_buddy is set to indicate that the page is free and in the buddy system
   * (see mm/page_alloc.c).
   *
+  * PG_hwpoison indicates that a page got corrupted in hardware and contains
+  * data with incorrect ECC bits that triggered a machine check. Accessing is
+  * not safe since it may cause another machine check. Don't touch!
   */
  
  /*
@@@ -101,6 -104,9 +104,9 @@@ enum pageflags 
  #endif
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
        PG_uncached,            /* Page has been mapped as uncached */
+ #endif
+ #ifdef CONFIG_MEMORY_FAILURE
+       PG_hwpoison,            /* hardware poisoned page. Don't touch */
  #endif
        __NR_PAGEFLAGS,
  
@@@ -158,9 -164,6 +164,9 @@@ static inline int TestSetPage##uname(st
  static inline int TestClearPage##uname(struct page *page)             \
                { return test_and_clear_bit(PG_##lname, &page->flags); }
  
 +#define __TESTCLEARFLAG(uname, lname)                                 \
 +static inline int __TestClearPage##uname(struct page *page)           \
 +              { return __test_and_clear_bit(PG_##lname, &page->flags); }
  
  #define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)             \
        SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
@@@ -187,9 -190,6 +193,9 @@@ static inline void __ClearPage##uname(s
  #define TESTCLEARFLAG_FALSE(uname)                                    \
  static inline int TestClearPage##uname(struct page *page) { return 0; }
  
 +#define __TESTCLEARFLAG_FALSE(uname)                                  \
 +static inline int __TestClearPage##uname(struct page *page) { return 0; }
 +
  struct page;  /* forward declaration */
  
  TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
@@@ -256,11 -256,11 +262,11 @@@ PAGEFLAG(Unevictable, unevictable) __CL
  #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
  #define MLOCK_PAGES 1
  PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
 -      TESTSCFLAG(Mlocked, mlocked)
 +      TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
  #else
  #define MLOCK_PAGES 0
 -PAGEFLAG_FALSE(Mlocked)
 -      SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
 +PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked)
 +      TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
  #endif
  
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
@@@ -269,6 -269,15 +275,15 @@@ PAGEFLAG(Uncached, uncached
  PAGEFLAG_FALSE(Uncached)
  #endif
  
+ #ifdef CONFIG_MEMORY_FAILURE
+ PAGEFLAG(HWPoison, hwpoison)
+ TESTSETFLAG(HWPoison, hwpoison)
+ #define __PG_HWPOISON (1UL << PG_hwpoison)
+ #else
+ PAGEFLAG_FALSE(HWPoison)
+ #define __PG_HWPOISON 0
+ #endif
  static inline int PageUptodate(struct page *page)
  {
        int ret = test_bit(PG_uptodate, &(page)->flags);
@@@ -393,7 -402,7 +408,7 @@@ static inline void __ClearPageTail(stru
         1 << PG_private | 1 << PG_private_2 | \
         1 << PG_buddy   | 1 << PG_writeback | 1 << PG_reserved | \
         1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
-        1 << PG_unevictable | __PG_MLOCKED)
+        1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
  
  /*
   * Flags checked when a page is prepped for return by the page allocator.
   */
  #define PAGE_FLAGS_CHECK_AT_PREP      ((1 << NR_PAGEFLAGS) - 1)
  
 -#endif /* !__GENERATING_BOUNDS_H */
 -
 +#define PAGE_FLAGS_PRIVATE                            \
 +      (1 << PG_private | 1 << PG_private_2)
  /**
   * page_has_private - Determine if page has private stuff
   * @page: The page to be checked
   * Determine if a page has private stuff, indicating that release routines
   * should be invoked upon it.
   */
 -#define page_has_private(page)                        \
 -      ((page)->flags & ((1 << PG_private) |   \
 -                        (1 << PG_private_2)))
 +static inline int page_has_private(struct page *page)
 +{
 +      return !!(page->flags & PAGE_FLAGS_PRIVATE);
 +}
 +
 +#endif /* !__GENERATING_BOUNDS_H */
  
  #endif        /* PAGE_FLAGS_H */
diff --combined include/linux/prctl.h
index 07bff666e65b695d1062638f8d20f9770d610161,3dc303197e674ec1afce3394ca7d0d6a74b768cf..931150566ade8d720f156665b2a5400bbc499f0d
@@@ -85,7 -85,9 +85,9 @@@
  #define PR_SET_TIMERSLACK 29
  #define PR_GET_TIMERSLACK 30
  
 -#define PR_TASK_PERF_COUNTERS_DISABLE         31
 -#define PR_TASK_PERF_COUNTERS_ENABLE          32
 +#define PR_TASK_PERF_EVENTS_DISABLE           31
 +#define PR_TASK_PERF_EVENTS_ENABLE            32
  
+ #define PR_MCE_KILL   33
  #endif /* _LINUX_PRCTL_H */
diff --combined include/linux/rmap.h
index 477841d29fce238a2888a7c0af0c7d05bc795524,3c1004e50747fd2cb82be52f08d6c8f69a2b017d..cb0ba7032609d5602a709a54f4b012413b981587
@@@ -71,17 -71,33 +71,29 @@@ void page_add_new_anon_rmap(struct pag
  void page_add_file_rmap(struct page *);
  void page_remove_rmap(struct page *);
  
 -#ifdef CONFIG_DEBUG_VM
 -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address);
 -#else
 -static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
 +static inline void page_dup_rmap(struct page *page)
  {
        atomic_inc(&page->_mapcount);
  }
 -#endif
  
  /*
   * Called from mm/vmscan.c to handle paging out
   */
  int page_referenced(struct page *, int is_locked,
                        struct mem_cgroup *cnt, unsigned long *vm_flags);
- int try_to_unmap(struct page *, int ignore_refs);
+ enum ttu_flags {
+       TTU_UNMAP = 0,                  /* unmap mode */
+       TTU_MIGRATION = 1,              /* migration mode */
+       TTU_MUNLOCK = 2,                /* munlock mode */
+       TTU_ACTION_MASK = 0xff,
+       TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
+       TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
+       TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+ };
+ #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
+ int try_to_unmap(struct page *, enum ttu_flags flags);
  
  /*
   * Called from mm/filemap_xip.c to unmap empty zero page
@@@ -108,6 -124,13 +120,13 @@@ int page_mkclean(struct page *)
   */
  int try_to_munlock(struct page *);
  
+ /*
+  * Called by memory-failure.c to kill processes.
+  */
+ struct anon_vma *page_lock_anon_vma(struct page *page);
+ void page_unlock_anon_vma(struct anon_vma *anon_vma);
+ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
  #else /* !CONFIG_MMU */
  
  #define anon_vma_init()               do {} while (0)
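
try_to_unmap() above now takes a bitmask rather than a plain flag: an action code in the low byte (extracted by TTU_ACTION()) plus modifier bits such as TTU_IGNORE_HWPOISON. The standalone illustration below copies the enum values from the hunk above purely to show the encoding; it is a demonstration, not kernel code.

#include <stdio.h>

/* Values copied from the include/linux/rmap.h hunk above, for illustration. */
enum ttu_flags {
	TTU_UNMAP = 0,			/* unmap mode */
	TTU_MIGRATION = 1,		/* migration mode */
	TTU_MUNLOCK = 2,		/* munlock mode */
	TTU_ACTION_MASK = 0xff,
	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
};
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)

int main(void)
{
	/* The kind of combination the memory-failure code might pass:
	 * plain unmap, but don't bail out on the poisoned page. */
	int flags = TTU_UNMAP | TTU_IGNORE_HWPOISON;

	printf("action code      = %d (TTU_UNMAP)\n", TTU_ACTION(flags));
	printf("ignore hwpoison? = %s\n",
	       (flags & TTU_IGNORE_HWPOISON) ? "yes" : "no");
	return 0;
}
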
diff --combined include/linux/sched.h
index 8a16f6d11dcd3efbb8fcf8536def605ae5684da8,29eae73c951d65dc7bc7e6d93c383bc9bd7f2a50..75e6e60bf583bb89a7784d4476a32766d10db420
@@@ -100,7 -100,7 +100,7 @@@ struct robust_list_head
  struct bio;
  struct fs_struct;
  struct bts_context;
 -struct perf_counter_context;
 +struct perf_event_context;
  
  /*
   * List of flags we want to share for kernel threads,
@@@ -140,10 -140,6 +140,10 @@@ extern int nr_processes(void)
  extern unsigned long nr_running(void);
  extern unsigned long nr_uninterruptible(void);
  extern unsigned long nr_iowait(void);
 +extern unsigned long nr_iowait_cpu(void);
 +extern unsigned long this_cpu_load(void);
 +
 +
  extern void calc_global_load(void);
  extern u64 cpu_nr_migrations(int cpu);
  
@@@ -194,7 -190,6 +194,7 @@@ extern unsigned long long time_sync_thr
  /* in tsk->state again */
  #define TASK_DEAD             64
  #define TASK_WAKEKILL         128
 +#define TASK_WAKING           256
  
  /* Convenience macros for the sake of set_task_state */
  #define TASK_KILLABLE         (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@@ -261,7 -256,7 +261,7 @@@ extern asmlinkage void schedule_tail(st
  extern void init_idle(struct task_struct *idle, int cpu);
  extern void init_idle_bootup_task(struct task_struct *idle);
  
 -extern int runqueue_is_locked(void);
 +extern int runqueue_is_locked(int cpu);
  extern void task_rq_unlock_wait(struct task_struct *p);
  
  extern cpumask_var_t nohz_cpu_mask;
@@@ -309,7 -304,7 +309,7 @@@ extern void softlockup_tick(void)
  extern void touch_softlockup_watchdog(void);
  extern void touch_all_softlockup_watchdogs(void);
  extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
 -                                  struct file *filp, void __user *buffer,
 +                                  void __user *buffer,
                                    size_t *lenp, loff_t *ppos);
  extern unsigned int  softlockup_panic;
  extern int softlockup_thresh;
@@@ -331,7 -326,7 +331,7 @@@ extern unsigned long sysctl_hung_task_c
  extern unsigned long sysctl_hung_task_timeout_secs;
  extern unsigned long sysctl_hung_task_warnings;
  extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 -                                       struct file *filp, void __user *buffer,
 +                                       void __user *buffer,
                                         size_t *lenp, loff_t *ppos);
  #endif
  
@@@ -426,15 -421,6 +426,15 @@@ static inline unsigned long get_mm_hiwa
        return max(mm->hiwater_rss, get_mm_rss(mm));
  }
  
 +static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 +                                       struct mm_struct *mm)
 +{
 +      unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
 +
 +      if (*maxrss < hiwater_rss)
 +              *maxrss = hiwater_rss;
 +}
 +
  static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
  {
        return max(mm->hiwater_vm, mm->total_vm);
@@@ -447,9 -433,7 +447,9 @@@ extern int get_dumpable(struct mm_struc
  /* dumpable bits */
  #define MMF_DUMPABLE      0  /* core dump is permitted */
  #define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
 +
  #define MMF_DUMPABLE_BITS 2
 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
  
  /* coredump filter bits */
  #define MMF_DUMP_ANON_PRIVATE 2
  #define MMF_DUMP_ELF_HEADERS  6
  #define MMF_DUMP_HUGETLB_PRIVATE 7
  #define MMF_DUMP_HUGETLB_SHARED  8
 +
  #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
  #define MMF_DUMP_FILTER_BITS  7
  #define MMF_DUMP_FILTER_MASK \
  #else
  # define MMF_DUMP_MASK_DEFAULT_ELF    0
  #endif
 +                                      /* leave room for more dump flags */
 +#define MMF_VM_MERGEABLE      16      /* KSM may merge identical pages */
 +
 +#define MMF_INIT_MASK         (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
  
  struct sighand_struct {
        atomic_t                count;
@@@ -493,13 -472,6 +493,13 @@@ struct pacct_struct 
        unsigned long           ac_minflt, ac_majflt;
  };
  
 +struct cpu_itimer {
 +      cputime_t expires;
 +      cputime_t incr;
 +      u32 error;
 +      u32 incr_error;
 +};
 +
  /**
   * struct task_cputime - collected CPU time counts
   * @utime:            time spent in user mode, in &cputime_t units
@@@ -594,12 -566,9 +594,12 @@@ struct signal_struct 
        struct pid *leader_pid;
        ktime_t it_real_incr;
  
 -      /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
 -      cputime_t it_prof_expires, it_virt_expires;
 -      cputime_t it_prof_incr, it_virt_incr;
 +      /*
 +       * ITIMER_PROF and ITIMER_VIRTUAL timers for the process; we use
 +       * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing the array, as these
 +       * values are defined to 0 and 1 respectively.
 +       */
 +      struct cpu_itimer it[2];
  
        /*
         * Thread group totals for process CPU timers.
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
 +      unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;
  
        /*
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
  #endif
 +
 +      int oom_adj;    /* OOM kill score adjustment (bit shift) */
  };
  
  /* Context switch must be unlocked if interrupts are to be enabled */
@@@ -734,7 -700,7 +734,7 @@@ struct user_struct 
  #endif
  #endif
  
 -#ifdef CONFIG_PERF_COUNTERS
 +#ifdef CONFIG_PERF_EVENTS
        atomic_long_t locked_vm;
  #endif
  };
@@@ -836,14 -802,14 +836,14 @@@ enum cpu_idle_type 
  #define SD_BALANCE_NEWIDLE    0x0002  /* Balance when about to become idle */
  #define SD_BALANCE_EXEC               0x0004  /* Balance on exec */
  #define SD_BALANCE_FORK               0x0008  /* Balance on fork, clone */
 -#define SD_WAKE_IDLE          0x0010  /* Wake to idle CPU on task wakeup */
 +#define SD_BALANCE_WAKE               0x0010  /* Balance on wakeup */
  #define SD_WAKE_AFFINE                0x0020  /* Wake task to waking CPU */
 -#define SD_WAKE_BALANCE               0x0040  /* Perform balancing at task wakeup */
 +#define SD_PREFER_LOCAL               0x0040  /* Prefer to keep tasks local to this domain */
  #define SD_SHARE_CPUPOWER     0x0080  /* Domain members share cpu power */
  #define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
  #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
 -#define SD_WAKE_IDLE_FAR      0x0800  /* Gain latency sacrificing cache hit */
 +
  #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
  
  enum powersavings_balance_level {
@@@ -1025,9 -991,6 +1025,9 @@@ static inline int test_sd_parent(struc
        return 0;
  }
  
 +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
 +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
 +
  #else /* CONFIG_SMP */
  
  struct sched_domain_attr;
@@@ -1039,7 -1002,6 +1039,7 @@@ partition_sched_domains(int ndoms_new, 
  }
  #endif        /* !CONFIG_SMP */
  
 +
  struct io_context;                    /* See blkdev.h */
  
  
@@@ -1057,12 -1019,6 +1057,12 @@@ struct uts_namespace
  struct rq;
  struct sched_domain;
  
 +/*
 + * wake flags
 + */
 +#define WF_SYNC               0x01            /* waker goes to sleep after wakeup */
 +#define WF_FORK               0x02            /* child wakeup after fork */
 +
  struct sched_class {
        const struct sched_class *next;
  
        void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
        void (*yield_task) (struct rq *rq);
  
 -      void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 +      void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
  
        struct task_struct * (*pick_next_task) (struct rq *rq);
        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
  
  #ifdef CONFIG_SMP
 -      int  (*select_task_rq)(struct task_struct *p, int sync);
 +      int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
  
        unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
                        struct rq *busiest, unsigned long max_load_move,
        void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
                             int oldprio, int running);
  
 +      unsigned int (*get_rr_interval) (struct task_struct *task);
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
        void (*moved_group) (struct task_struct *p);
  #endif
@@@ -1148,8 -1102,6 +1148,8 @@@ struct sched_entity 
        u64                     start_runtime;
        u64                     avg_wakeup;
  
 +      u64                     avg_running;
 +
  #ifdef CONFIG_SCHEDSTATS
        u64                     wait_start;
        u64                     wait_max;
@@@ -1247,6 -1199,7 +1247,6 @@@ struct task_struct 
         * a short time
         */
        unsigned char fpu_counter;
 -      s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
  #ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int btrace_seq;
  #endif
        struct mm_struct *mm, *active_mm;
  
  /* task state */
 -      struct linux_binfmt *binfmt;
        int exit_state;
        int exit_code, exit_signal;
        int pdeath_signal;  /*  The signal sent when the parent dies  */
        struct list_head pi_state_list;
        struct futex_pi_state *pi_state_cache;
  #endif
 -#ifdef CONFIG_PERF_COUNTERS
 -      struct perf_counter_context *perf_counter_ctxp;
 -      struct mutex perf_counter_mutex;
 -      struct list_head perf_counter_list;
 +#ifdef CONFIG_PERF_EVENTS
 +      struct perf_event_context *perf_event_ctxp;
 +      struct mutex perf_event_mutex;
 +      struct list_head perf_event_list;
  #endif
  #ifdef CONFIG_NUMA
        struct mempolicy *mempolicy;    /* Protected by alloc_lock */
        /* bitmask of trace recursion */
        unsigned long trace_recursion;
  #endif /* CONFIG_TRACING */
 +      unsigned long stack_start;
  };
  
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@@ -1734,6 -1687,7 +1734,7 @@@ extern cputime_t task_gtime(struct task
  #define PF_EXITPIDONE 0x00000008      /* pi exit done on shut down */
  #define PF_VCPU               0x00000010      /* I'm a virtual CPU */
  #define PF_FORKNOEXEC 0x00000040      /* forked but didn't exec */
+ #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
  #define PF_SUPERPRIV  0x00000100      /* used super-user privileges */
  #define PF_DUMPCORE   0x00000200      /* dumped core */
  #define PF_SIGNALED   0x00000400      /* killed by a signal */
  #define PF_FROZEN     0x00010000      /* frozen for system suspend */
  #define PF_FSTRANS    0x00020000      /* inside a filesystem transaction */
  #define PF_KSWAPD     0x00040000      /* I am kswapd */
 -#define PF_SWAPOFF    0x00080000      /* I am in swapoff */
 +#define PF_OOM_ORIGIN 0x00080000      /* Allocating much memory to others */
  #define PF_LESS_THROTTLE 0x00100000   /* Throttle me less: I clean memory */
  #define PF_KTHREAD    0x00200000      /* I am a kernel thread */
  #define PF_RANDOMIZE  0x00400000      /* randomize virtual address space */
  #define PF_SPREAD_PAGE        0x01000000      /* Spread page cache over cpuset */
  #define PF_SPREAD_SLAB        0x02000000      /* Spread some slab caches over cpuset */
  #define PF_THREAD_BOUND       0x04000000      /* Thread bound to specific cpu */
+ #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
  #define PF_MEMPOLICY  0x10000000      /* Non-default NUMA mempolicy */
  #define PF_MUTEX_TESTER       0x20000000      /* Thread belongs to the rt mutex tester */
  #define PF_FREEZER_SKIP       0x40000000      /* Freezer should not count it as freezeable */
  
  #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
  #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
 -#define RCU_READ_UNLOCK_GOT_QS  (1 << 2) /* CPU has responded to RCU core. */
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
@@@ -1816,13 -1772,10 +1818,13 @@@ static inline int set_cpus_allowed_ptr(
        return 0;
  }
  #endif
 +
 +#ifndef CONFIG_CPUMASK_OFFSTACK
  static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
  {
        return set_cpus_allowed_ptr(p, &new_mask);
  }
 +#endif
  
  /*
   * Architectures can set this to 1 if they have specified
@@@ -1905,7 -1858,7 +1907,7 @@@ extern unsigned int sysctl_sched_time_a
  extern unsigned int sysctl_timer_migration;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
 -              struct file *file, void __user *buffer, size_t *length,
 +              void __user *buffer, size_t *length,
                loff_t *ppos);
  #endif
  #ifdef CONFIG_SCHED_DEBUG
@@@ -1923,7 -1876,7 +1925,7 @@@ extern unsigned int sysctl_sched_rt_per
  extern int sysctl_sched_rt_runtime;
  
  int sched_rt_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos);
  
  extern unsigned int sysctl_sched_compat_yield;
@@@ -2058,7 -2011,6 +2060,7 @@@ extern int kill_pgrp(struct pid *pid, i
  extern int kill_pid(struct pid *pid, int sig, int priv);
  extern int kill_proc_info(int, struct siginfo *, pid_t);
  extern int do_notify_parent(struct task_struct *, int);
 +extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
  extern void force_sig(int, struct task_struct *);
  extern void force_sig_specific(int, struct task_struct *);
  extern int send_sig(int, struct task_struct *, int);
@@@ -2336,10 -2288,7 +2338,10 @@@ static inline int signal_pending(struc
        return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
  }
  
 -extern int __fatal_signal_pending(struct task_struct *p);
 +static inline int __fatal_signal_pending(struct task_struct *p)
 +{
 +      return unlikely(sigismember(&p->pending.signal, SIGKILL));
 +}
  
  static inline int fatal_signal_pending(struct task_struct *p)
  {
diff --combined include/linux/swap.h
index 82232dbea3f711217384bc4e2bbf1380f0851de8,f077e454c65918cfa7e3812adf92d14f4610ef1e..4ec90019c1a4c3997c8da80dd3d36f3b31d09b11
@@@ -34,15 -34,37 +34,37 @@@ static inline int current_is_kswapd(voi
   * the type/offset into the pte as 5/27 as well.
   */
  #define MAX_SWAPFILES_SHIFT   5
- #ifndef CONFIG_MIGRATION
- #define MAX_SWAPFILES         (1 << MAX_SWAPFILES_SHIFT)
+ /*
+  * Use some of the swap files numbers for other purposes. This
+  * is a convenient way to hook into the VM to trigger special
+  * actions on faults.
+  */
+ /*
+  * NUMA node memory migration support
+  */
+ #ifdef CONFIG_MIGRATION
+ #define SWP_MIGRATION_NUM 2
+ #define SWP_MIGRATION_READ    (MAX_SWAPFILES + SWP_HWPOISON_NUM)
+ #define SWP_MIGRATION_WRITE   (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
  #else
- /* Use last two entries for page migration swap entries */
- #define MAX_SWAPFILES         ((1 << MAX_SWAPFILES_SHIFT)-2)
- #define SWP_MIGRATION_READ    MAX_SWAPFILES
- #define SWP_MIGRATION_WRITE   (MAX_SWAPFILES + 1)
+ #define SWP_MIGRATION_NUM 0
  #endif
  
+ /*
+  * Handling of hardware poisoned pages with memory corruption.
+  */
+ #ifdef CONFIG_MEMORY_FAILURE
+ #define SWP_HWPOISON_NUM 1
+ #define SWP_HWPOISON          MAX_SWAPFILES
+ #else
+ #define SWP_HWPOISON_NUM 0
+ #endif
+ #define MAX_SWAPFILES \
+       ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
  /*
   * Magic header for a swap area. The first part of the union is
   * what the swap magic looks like for the old (limited to 128MB)
@@@ -217,11 -239,6 +239,11 @@@ extern unsigned long try_to_free_pages(
  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
                                                  gfp_t gfp_mask, bool noswap,
                                                  unsigned int swappiness);
 +extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 +                                              gfp_t gfp_mask, bool noswap,
 +                                              unsigned int swappiness,
 +                                              struct zone *zone,
 +                                              int nid);
  extern int __isolate_lru_page(struct page *page, int mode, int file);
  extern unsigned long shrink_all_memory(unsigned long nr_pages);
  extern int vm_swappiness;
@@@ -245,7 -262,7 +267,7 @@@ extern int page_evictable(struct page *
  extern void scan_mapping_unevictable_pages(struct address_space *);
  
  extern unsigned long scan_unevictable_pages;
 -extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
 +extern int scan_unevictable_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
  extern int scan_unevictable_register_node(struct node *node);
  extern void scan_unevictable_unregister_node(struct node *node);
@@@ -424,22 -441,10 +446,22 @@@ static inline swp_entry_t get_swap_page
  }
  
  /* linux/mm/thrash.c */
 -#define put_swap_token(mm)    do { } while (0)
 -#define grab_swap_token(mm)   do { } while (0)
 -#define has_swap_token(mm)    0
 -#define disable_swap_token()  do { } while (0)
 +static inline void put_swap_token(struct mm_struct *mm)
 +{
 +}
 +
 +static inline void grab_swap_token(struct mm_struct *mm)
 +{
 +}
 +
 +static inline int has_swap_token(struct mm_struct *mm)
 +{
 +      return 0;
 +}
 +
 +static inline void disable_swap_token(void)
 +{
 +}
  
  static inline void
  mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
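
With both CONFIG_MIGRATION and CONFIG_MEMORY_FAILURE enabled, the arithmetic
above reserves the top three of the 1 << MAX_SWAPFILES_SHIFT == 32 encodable
swap types.  A compile-time sketch (not part of the merge) that just spells out
the resulting numbers:

    #include <linux/kernel.h>
    #include <linux/swap.h>

    #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MIGRATION)
    static void __maybe_unused swap_type_layout(void)
    {
            BUILD_BUG_ON(MAX_SWAPFILES != 29);              /* 32 - 2 - 1    */
            BUILD_BUG_ON(SWP_HWPOISON != 29);               /* 1 poison slot */
            BUILD_BUG_ON(SWP_MIGRATION_READ != 30);         /* 2 migration   */
            BUILD_BUG_ON(SWP_MIGRATION_WRITE != 31);        /*   slots       */
    }
    #endif
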
diff --combined kernel/sys.c
index ebcb15611728c510ef59312e3a0cd398757e0b8f,41e02eff33986ca3b32834a696b107433a0b5551..255475d163e0cdb62602306a28134d67005e87c0
@@@ -14,7 -14,7 +14,7 @@@
  #include <linux/prctl.h>
  #include <linux/highuid.h>
  #include <linux/fs.h>
 -#include <linux/perf_counter.h>
 +#include <linux/perf_event.h>
  #include <linux/resource.h>
  #include <linux/kernel.h>
  #include <linux/kexec.h>
@@@ -1338,7 -1338,6 +1338,7 @@@ static void k_getrusage(struct task_str
        unsigned long flags;
        cputime_t utime, stime;
        struct task_cputime cputime;
 +      unsigned long maxrss = 0;
  
        memset((char *) r, 0, sizeof *r);
        utime = stime = cputime_zero;
                utime = task_utime(current);
                stime = task_stime(current);
                accumulate_thread_rusage(p, r);
 +              maxrss = p->signal->maxrss;
                goto out;
        }
  
                        r->ru_majflt = p->signal->cmaj_flt;
                        r->ru_inblock = p->signal->cinblock;
                        r->ru_oublock = p->signal->coublock;
 +                      maxrss = p->signal->cmaxrss;
  
                        if (who == RUSAGE_CHILDREN)
                                break;
                        r->ru_majflt += p->signal->maj_flt;
                        r->ru_inblock += p->signal->inblock;
                        r->ru_oublock += p->signal->oublock;
 +                      if (maxrss < p->signal->maxrss)
 +                              maxrss = p->signal->maxrss;
                        t = p;
                        do {
                                accumulate_thread_rusage(t, r);
  out:
        cputime_to_timeval(utime, &r->ru_utime);
        cputime_to_timeval(stime, &r->ru_stime);
 +
 +      if (who != RUSAGE_CHILDREN) {
 +              struct mm_struct *mm = get_task_mm(p);
 +              if (mm) {
 +                      setmax_mm_hiwater_rss(&maxrss, mm);
 +                      mmput(mm);
 +              }
 +      }
 +      r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
  }
  
  int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@@ -1525,11 -1511,11 +1525,11 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                case PR_SET_TSC:
                        error = SET_TSC_CTL(arg2);
                        break;
 -              case PR_TASK_PERF_COUNTERS_DISABLE:
 -                      error = perf_counter_task_disable();
 +              case PR_TASK_PERF_EVENTS_DISABLE:
 +                      error = perf_event_task_disable();
                        break;
 -              case PR_TASK_PERF_COUNTERS_ENABLE:
 -                      error = perf_counter_task_enable();
 +              case PR_TASK_PERF_EVENTS_ENABLE:
 +                      error = perf_event_task_enable();
                        break;
                case PR_GET_TIMERSLACK:
                        error = current->timer_slack_ns;
                                current->timer_slack_ns = arg2;
                        error = 0;
                        break;
+               case PR_MCE_KILL:
+                       if (arg4 | arg5)
+                               return -EINVAL;
+                       switch (arg2) {
+                       case 0:
+                               if (arg3 != 0)
+                                       return -EINVAL;
+                               current->flags &= ~PF_MCE_PROCESS;
+                               break;
+                       case 1:
+                               current->flags |= PF_MCE_PROCESS;
+                               if (arg3 != 0)
+                                       current->flags |= PF_MCE_EARLY;
+                               else
+                                       current->flags &= ~PF_MCE_EARLY;
+                               break;
+                       default:
+                               return -EINVAL;
+                       }
+                       error = 0;
+                       break;
                default:
                        error = -EINVAL;
                        break;
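
Besides the PR_MCE_KILL handling, k_getrusage() now fills in the peak RSS from
the signal_struct maxrss/cmaxrss counters plus the mm hiwater RSS, converted to
kilobytes by maxrss * (PAGE_SIZE / 1024).  A minimal user-space reader:

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
            struct rusage ru;

            if (getrusage(RUSAGE_SELF, &ru) == 0)
                    printf("peak RSS: %ld kB\n", ru.ru_maxrss);
            return 0;
    }
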
diff --combined kernel/sysctl.c
index a02697b7cb97a94a1f9632846a488c23e099fa7c,eacae77ac9fc0a3c8d0660cbdfb0e5337e61fe82..0d949c517412ee16822a5ca7d6e7c79218543741
@@@ -26,6 -26,7 +26,6 @@@
  #include <linux/proc_fs.h>
  #include <linux/security.h>
  #include <linux/ctype.h>
 -#include <linux/utsname.h>
  #include <linux/kmemcheck.h>
  #include <linux/smp_lock.h>
  #include <linux/fs.h>
@@@ -49,7 -50,7 +49,7 @@@
  #include <linux/reboot.h>
  #include <linux/ftrace.h>
  #include <linux/slow-work.h>
 -#include <linux/perf_counter.h>
 +#include <linux/perf_event.h>
  
  #include <asm/uaccess.h>
  #include <asm/processor.h>
@@@ -76,7 -77,6 +76,7 @@@ extern int max_threads
  extern int core_uses_pid;
  extern int suid_dumpable;
  extern char core_pattern[];
 +extern unsigned int core_pipe_limit;
  extern int pid_max;
  extern int min_free_kbytes;
  extern int pid_max_min, pid_max_max;
@@@ -91,9 -91,7 +91,9 @@@ extern int sysctl_nr_trim_pages
  #ifdef CONFIG_RCU_TORTURE_TEST
  extern int rcutorture_runnable;
  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 +#ifdef CONFIG_BLOCK
  extern int blk_iopoll_enabled;
 +#endif
  
  /* Constants used for minimum and  maximum */
  #ifdef CONFIG_DETECT_SOFTLOCKUP
@@@ -106,9 -104,6 +106,9 @@@ static int __maybe_unused one = 1
  static int __maybe_unused two = 2;
  static unsigned long one_ul = 1;
  static int one_hundred = 100;
 +#ifdef CONFIG_PRINTK
 +static int ten_thousand = 10000;
 +#endif
  
  /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
  static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@@ -163,9 -158,9 +163,9 @@@ extern int max_lock_depth
  #endif
  
  #ifdef CONFIG_PROC_SYSCTL
 -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 +static int proc_do_cad_pid(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
 -static int proc_taint(struct ctl_table *table, int write, struct file *filp,
 +static int proc_taint(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp, loff_t *ppos);
  #endif
  
@@@ -424,14 -419,6 +424,14 @@@ static struct ctl_table kern_table[] = 
                .proc_handler   = &proc_dostring,
                .strategy       = &sysctl_string,
        },
 +      {
 +              .ctl_name       = CTL_UNNUMBERED,
 +              .procname       = "core_pipe_limit",
 +              .data           = &core_pipe_limit,
 +              .maxlen         = sizeof(unsigned int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec,
 +      },
  #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
 +      {
 +              .ctl_name       = CTL_UNNUMBERED,
 +              .procname       = "printk_delay",
 +              .data           = &printk_delay_msec,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec_minmax,
 +              .strategy       = &sysctl_intvec,
 +              .extra1         = &zero,
 +              .extra2         = &ten_thousand,
 +      },
  #endif
        {
                .ctl_name       = KERN_NGROUPS_MAX,
                .child          = slow_work_sysctls,
        },
  #endif
 -#ifdef CONFIG_PERF_COUNTERS
 +#ifdef CONFIG_PERF_EVENTS
        {
                .ctl_name       = CTL_UNNUMBERED,
 -              .procname       = "perf_counter_paranoid",
 -              .data           = &sysctl_perf_counter_paranoid,
 -              .maxlen         = sizeof(sysctl_perf_counter_paranoid),
 +              .procname       = "perf_event_paranoid",
 +              .data           = &sysctl_perf_event_paranoid,
 +              .maxlen         = sizeof(sysctl_perf_event_paranoid),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
 -              .procname       = "perf_counter_mlock_kb",
 -              .data           = &sysctl_perf_counter_mlock,
 -              .maxlen         = sizeof(sysctl_perf_counter_mlock),
 +              .procname       = "perf_event_mlock_kb",
 +              .data           = &sysctl_perf_event_mlock,
 +              .maxlen         = sizeof(sysctl_perf_event_mlock),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
 -              .procname       = "perf_counter_max_sample_rate",
 -              .data           = &sysctl_perf_counter_sample_rate,
 -              .maxlen         = sizeof(sysctl_perf_counter_sample_rate),
 +              .procname       = "perf_event_max_sample_rate",
 +              .data           = &sysctl_perf_event_sample_rate,
 +              .maxlen         = sizeof(sysctl_perf_event_sample_rate),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
                .proc_handler   = &proc_dointvec,
        },
  #endif
 +#ifdef CONFIG_BLOCK
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "blk_iopoll",
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
 +#endif
  /*
   * NOTE: do not add new entries to this table unless you have read
   * Documentation/sysctl/ctl_unnumbered.txt
@@@ -1398,6 -1372,31 +1398,31 @@@ static struct ctl_table vm_table[] = 
                .mode           = 0644,
                .proc_handler   = &scan_unevictable_handler,
        },
+ #ifdef CONFIG_MEMORY_FAILURE
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "memory_failure_early_kill",
+               .data           = &sysctl_memory_failure_early_kill,
+               .maxlen         = sizeof(sysctl_memory_failure_early_kill),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "memory_failure_recovery",
+               .data           = &sysctl_memory_failure_recovery,
+               .maxlen         = sizeof(sysctl_memory_failure_recovery),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+ #endif
  /*
   * NOTE: do not add new entries to this table unless you have read
   * Documentation/sysctl/ctl_unnumbered.txt
@@@ -2226,7 -2225,7 +2251,7 @@@ void sysctl_head_put(struct ctl_table_h
  #ifdef CONFIG_PROC_SYSCTL
  
  static int _proc_do_string(void* data, int maxlen, int write,
 -                         struct file *filp, void __user *buffer,
 +                         void __user *buffer,
                           size_t *lenp, loff_t *ppos)
  {
        size_t len;
   * proc_dostring - read a string sysctl
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dostring(struct ctl_table *table, int write, struct file *filp,
 +int proc_dostring(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -      return _proc_do_string(table->data, table->maxlen, write, filp,
 +      return _proc_do_string(table->data, table->maxlen, write,
                               buffer, lenp, ppos);
  }
  
@@@ -2328,7 -2328,7 +2353,7 @@@ static int do_proc_dointvec_conv(int *n
  }
  
  static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 -                int write, struct file *filp, void __user *buffer,
 +                int write, void __user *buffer,
                  size_t *lenp, loff_t *ppos,
                  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
                              int write, void *data),
  #undef TMPBUFLEN
  }
  
 -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
 +static int do_proc_dointvec(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos,
                  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
                              int write, void *data),
                  void *data)
  {
 -      return __do_proc_dointvec(table->data, table, write, filp,
 +      return __do_proc_dointvec(table->data, table, write,
                        buffer, lenp, ppos, conv, data);
  }
  
   * proc_dointvec - read a vector of integers
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec(struct ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 +    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            NULL,NULL);
  }
  
   * Taint values can only be increased
   * This means we can safely use a temporary.
   */
 -static int proc_taint(struct ctl_table *table, int write, struct file *filp,
 +static int proc_taint(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        struct ctl_table t;
  
        t = *table;
        t.data = &tmptaint;
 -      err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
 +      err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
        if (err < 0)
                return err;
  
@@@ -2533,6 -2534,7 +2558,6 @@@ static int do_proc_dointvec_minmax_conv
   * proc_dointvec_minmax - read a vector of integers with min/max values
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_minmax(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        struct do_proc_dointvec_minmax_conv_param param = {
                .min = (int *) table->extra1,
                .max = (int *) table->extra2,
        };
 -      return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
 +      return do_proc_dointvec(table, write, buffer, lenp, ppos,
                                do_proc_dointvec_minmax_conv, &param);
  }
  
  static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
 -                                   struct file *filp,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos,
                                     unsigned long convmul,
  }
  
  static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
 -                                   struct file *filp,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos,
                                     unsigned long convmul,
                                     unsigned long convdiv)
  {
        return __do_proc_doulongvec_minmax(table->data, table, write,
 -                      filp, buffer, lenp, ppos, convmul, convdiv);
 +                      buffer, lenp, ppos, convmul, convdiv);
  }
  
  /**
   * proc_doulongvec_minmax - read a vector of long integers with min/max values
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_doulongvec_minmax(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
 +    return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
  }
  
  /**
   * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   * Returns 0 on success.
   */
  int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 -                                    struct file *filp,
                                      void __user *buffer,
                                      size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_doulongvec_minmax(table, write, filp, buffer,
 +    return do_proc_doulongvec_minmax(table, write, buffer,
                                     lenp, ppos, HZ, 1000l);
  }
  
@@@ -2789,6 -2796,7 +2814,6 @@@ static int do_proc_dointvec_ms_jiffies_
   * proc_dointvec_jiffies - read a vector of integers as seconds
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_jiffies(struct ctl_table *table, int write,
                          void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 +    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            do_proc_dointvec_jiffies_conv,NULL);
  }
  
   * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: pointer to the file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 +    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            do_proc_dointvec_userhz_jiffies_conv,NULL);
  }
  
   * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -      return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
 +      return do_proc_dointvec(table, write, buffer, lenp, ppos,
                                do_proc_dointvec_ms_jiffies_conv, NULL);
  }
  
 -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 +static int proc_do_cad_pid(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        struct pid *new_pid;
  
        tmp = pid_vnr(cad_pid);
  
 -      r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
 +      r = __do_proc_dointvec(&tmp, table, write, buffer,
                               lenp, ppos, NULL, NULL);
        if (r || !write)
                return r;
  
  #else /* CONFIG_PROC_FS */
  
 -int proc_dostring(struct ctl_table *table, int write, struct file *filp,
 +int proc_dostring(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_minmax(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_jiffies(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_doulongvec_minmax(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
  int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 -                                    struct file *filp,
                                      void __user *buffer,
                                      size_t *lenp, loff_t *ppos)
  {
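
The two new vm_table entries surface as /proc/sys/vm/memory_failure_early_kill
and /proc/sys/vm/memory_failure_recovery, each clamped to 0 or 1 by
proc_dointvec_minmax.  A small sketch that switches early kill on system-wide
(needs root):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/vm/memory_failure_early_kill", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fputs("1\n", f);        /* values outside 0..1 are rejected */
            fclose(f);
            return 0;
    }
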
diff --combined mm/Kconfig
index 71eb0b4cce8dbc425aa476d54f2048e5679c7dea,4b4e57a9643e240c90163f23fe3913d6e32891be..247760729593d37f841655dd54ec4572523255f7
@@@ -214,18 -214,6 +214,18 @@@ config HAVE_MLOCKED_PAGE_BI
  config MMU_NOTIFIER
        bool
  
 +config KSM
 +      bool "Enable KSM for page merging"
 +      depends on MMU
 +      help
 +        Enable Kernel Samepage Merging: KSM periodically scans those areas
 +        of an application's address space that an app has advised may be
 +        mergeable.  When it finds pages of identical content, it replaces
 +        the many instances by a single resident page with that content, so
 +        saving memory until one or another app needs to modify the content.
 +        Recommended for use with KVM, or with other duplicative applications.
 +        See Documentation/vm/ksm.txt for more information.
 +
  config DEFAULT_MMAP_MIN_ADDR
          int "Low address space to protect from user allocation"
          default 4096
          /proc/sys/vm/mmap_min_addr tunable.
  
  
+ config MEMORY_FAILURE
+       depends on MMU
+       depends on X86_MCE
+       bool "Enable recovery from hardware memory errors"
+       help
+         Enables code to recover from some memory failures on systems
+         with MCA recovery. This allows a system to continue running
+         even when some of its memory has uncorrected errors. This requires
+         special hardware support and typically ECC memory.
+
+ config HWPOISON_INJECT
+       tristate "Poison pages injector"
+       depends on MEMORY_FAILURE && DEBUG_KERNEL
+
  config NOMMU_INITIAL_TRIM_EXCESS
        int "Turn on mmap() excess space trimming before booting"
        depends on !MMU
diff --combined mm/Makefile
index 88193d73cd1a30dd623e94eb9b5bed0c96cf08e0,713c9f82d5ab86b306cc03408c3930b539a44b7f..515fd793c17fa989cffe0f3a686c8086e2f7ddca
@@@ -11,10 -11,10 +11,10 @@@ obj-y                      := bootmem.o filemap.o mempool.
                           maccess.o page_alloc.o page-writeback.o \
                           readahead.o swap.o truncate.o vmscan.o shmem.o \
                           prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 -                         page_isolation.o mm_init.o $(mmu-y)
 +                         page_isolation.o mm_init.o mmu_context.o \
 +                         pagewalk.o $(mmu-y)
  obj-y += init-mm.o
  
 -obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
  obj-$(CONFIG_BOUNCE)  += bounce.o
  obj-$(CONFIG_SWAP)    += page_io.o swap_state.o swapfile.o thrash.o
  obj-$(CONFIG_HAS_DMA) += dmapool.o
@@@ -25,7 -25,6 +25,7 @@@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += spar
  obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
  obj-$(CONFIG_SLOB) += slob.o
  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 +obj-$(CONFIG_KSM) += ksm.o
  obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
  obj-$(CONFIG_SLAB) += slab.o
  obj-$(CONFIG_SLUB) += slub.o
@@@ -41,5 -40,7 +41,7 @@@ obj-$(CONFIG_SMP) += allocpercpu.
  endif
  obj-$(CONFIG_QUICKLIST) += quicklist.o
  obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
  obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --combined mm/filemap.c
index bcc7372aebbc4375d0763e4f3acd8d096bcb612d,75575c3921672e91a0690231659d1ecd1eda2103..c1fc205a92c6eae9840961ad34dbfc88b58a80a4
   *
   *  ->task->proc_lock
   *    ->dcache_lock           (proc_pid_lookup)
+  *
+  *  (code doesn't rely on that order, so you could switch it around)
+  *  ->tasklist_lock             (memory_failure, collect_procs_ao)
+  *    ->i_mmap_lock
   */
  
  /*
@@@ -119,8 -123,6 +123,8 @@@ void __remove_from_page_cache(struct pa
        page->mapping = NULL;
        mapping->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
 +      if (PageSwapBacked(page))
 +              __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
  
        /*
@@@ -433,8 -435,6 +437,8 @@@ int add_to_page_cache_locked(struct pag
                if (likely(!error)) {
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
 +                      if (PageSwapBacked(page))
 +                              __inc_zone_page_state(page, NR_SHMEM);
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
diff --combined mm/madvise.c
index d9ae2067952e5d2b5d09b260bc443d1ba0899d6a,8dbd38b8e4a4c9bf75997a9a4766e61884824f5b..35b1479b7c9d080ed97b772da44603c4f757093c
@@@ -11,7 -11,6 +11,7 @@@
  #include <linux/mempolicy.h>
  #include <linux/hugetlb.h>
  #include <linux/sched.h>
 +#include <linux/ksm.h>
  
  /*
   * Any behaviour which results in changes to the vma->vm_flags needs to
@@@ -42,7 -41,7 +42,7 @@@ static long madvise_behavior(struct vm_
        struct mm_struct * mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
 -      int new_flags = vma->vm_flags;
 +      unsigned long new_flags = vma->vm_flags;
  
        switch (behavior) {
        case MADV_NORMAL:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
 +              if (vma->vm_flags & VM_IO) {
 +                      error = -EINVAL;
 +                      goto out;
 +              }
                new_flags &= ~VM_DONTCOPY;
                break;
 +      case MADV_MERGEABLE:
 +      case MADV_UNMERGEABLE:
 +              error = ksm_madvise(vma, start, end, behavior, &new_flags);
 +              if (error)
 +                      goto out;
 +              break;
        }
  
        if (new_flags == vma->vm_flags) {
@@@ -218,20 -207,67 +218,46 @@@ static long madvise_remove(struct vm_ar
        return error;
  }
  
+ #ifdef CONFIG_MEMORY_FAILURE
+ /*
+  * Error injection support for memory error handling.
+  */
+ static int madvise_hwpoison(unsigned long start, unsigned long end)
+ {
+       int ret = 0;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       for (; start < end; start += PAGE_SIZE) {
+               struct page *p;
+               int ret = get_user_pages(current, current->mm, start, 1,
+                                               0, 0, &p, NULL);
+               if (ret != 1)
+                       return ret;
+               printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
+                      page_to_pfn(p), start);
+               /* Ignore return value for now */
+               __memory_failure(page_to_pfn(p), 0, 1);
+               put_page(p);
+       }
+       return ret;
+ }
+ #endif
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
  {
 -      long error;
 -
        switch (behavior) {
 -      case MADV_DOFORK:
 -              if (vma->vm_flags & VM_IO) {
 -                      error = -EINVAL;
 -                      break;
 -              }
 -      case MADV_DONTFORK:
 -      case MADV_NORMAL:
 -      case MADV_SEQUENTIAL:
 -      case MADV_RANDOM:
 -              error = madvise_behavior(vma, prev, start, end, behavior);
 -              break;
        case MADV_REMOVE:
 -              error = madvise_remove(vma, prev, start, end);
 -              break;
 -
 +              return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
 -              error = madvise_willneed(vma, prev, start, end);
 -              break;
 -
 +              return madvise_willneed(vma, prev, start, end);
        case MADV_DONTNEED:
 -              error = madvise_dontneed(vma, prev, start, end);
 -              break;
 -
 +              return madvise_dontneed(vma, prev, start, end);
        default:
 -              BUG();
 -              break;
 +              return madvise_behavior(vma, prev, start, end, behavior);
        }
 -      return error;
  }
  
  static int
@@@ -246,17 -282,12 +272,17 @@@ madvise_behavior_valid(int behavior
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
 +#ifdef CONFIG_KSM
 +      case MADV_MERGEABLE:
 +      case MADV_UNMERGEABLE:
 +#endif
                return 1;
  
        default:
                return 0;
        }
  }
 +
  /*
   * The madvise(2) system call.
   *
   *            so the kernel can free resources associated with it.
   *  MADV_REMOVE - the application wants to free up the given range of
   *            pages and associated backing store.
 + *  MADV_DONTFORK - omit this area from child's address space when forking:
 + *            typically, to avoid COWing pages pinned by get_user_pages().
 + *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 + *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 + *            this area with pages of identical content from other such areas.
 + *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
   *
   * return values:
   *  zero    - success
@@@ -308,6 -333,10 +334,10 @@@ SYSCALL_DEFINE3(madvise, unsigned long
        int write;
        size_t len;
  
+ #ifdef CONFIG_MEMORY_FAILURE
+       if (behavior == MADV_HWPOISON)
+               return madvise_hwpoison(start, start+len_in);
+ #endif
        if (!madvise_behavior_valid(behavior))
                return error;
  
diff --combined mm/memory.c
index b1443ac07c00a4f1a46de6bb260d00e8f52f99db,44ea41196c139ab80ecbb1a6066f6e8f759223cf..987389a809e77accb1915dbd291d3e38b8dc1e60
@@@ -45,7 -45,6 +45,7 @@@
  #include <linux/swap.h>
  #include <linux/highmem.h>
  #include <linux/pagemap.h>
 +#include <linux/ksm.h>
  #include <linux/rmap.h>
  #include <linux/module.h>
  #include <linux/delayacct.h>
@@@ -57,7 -56,6 +57,7 @@@
  #include <linux/swapops.h>
  #include <linux/elf.h>
  
 +#include <asm/io.h>
  #include <asm/pgalloc.h>
  #include <asm/uaccess.h>
  #include <asm/tlb.h>
@@@ -108,18 -106,6 +108,18 @@@ static int __init disable_randmaps(cha
  }
  __setup("norandmaps", disable_randmaps);
  
 +unsigned long zero_pfn __read_mostly;
 +unsigned long highest_memmap_pfn __read_mostly;
 +
 +/*
 + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 + */
 +static int __init init_zero_pfn(void)
 +{
 +      zero_pfn = page_to_pfn(ZERO_PAGE(0));
 +      return 0;
 +}
 +core_initcall(init_zero_pfn);
  
  /*
   * If a p?d_bad entry is found while walking page tables, report
@@@ -456,20 -442,6 +456,20 @@@ static inline int is_cow_mapping(unsign
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
  }
  
 +#ifndef is_zero_pfn
 +static inline int is_zero_pfn(unsigned long pfn)
 +{
 +      return pfn == zero_pfn;
 +}
 +#endif
 +
 +#ifndef my_zero_pfn
 +static inline unsigned long my_zero_pfn(unsigned long addr)
 +{
 +      return zero_pfn;
 +}
 +#endif
 +
  /*
   * vm_normal_page -- This function gets the "struct page" associated with a pte.
   *
@@@ -525,9 -497,7 +525,9 @@@ struct page *vm_normal_page(struct vm_a
        if (HAVE_PTE_SPECIAL) {
                if (likely(!pte_special(pte)))
                        goto check_pfn;
 -              if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
 +              if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
 +                      return NULL;
 +              if (!is_zero_pfn(pfn))
                        print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }
                }
        }
  
 +      if (is_zero_pfn(pfn))
 +              return NULL;
  check_pfn:
        if (unlikely(pfn > highest_memmap_pfn)) {
                print_bad_pte(vma, addr, pte, NULL);
@@@ -628,8 -596,8 +628,8 @@@ copy_one_pte(struct mm_struct *dst_mm, 
        page = vm_normal_page(vma, addr, pte);
        if (page) {
                get_page(page);
 -              page_dup_rmap(page, vma, addr);
 -              rss[!!PageAnon(page)]++;
 +              page_dup_rmap(page);
 +              rss[PageAnon(page)]++;
        }
  
  out_set_pte:
@@@ -1174,14 -1142,9 +1174,14 @@@ struct page *follow_page(struct vm_area
                goto no_page;
        if ((flags & FOLL_WRITE) && !pte_write(pte))
                goto unlock;
 +
        page = vm_normal_page(vma, address, pte);
 -      if (unlikely(!page))
 -              goto bad_page;
 +      if (unlikely(!page)) {
 +              if ((flags & FOLL_DUMP) ||
 +                  !is_zero_pfn(pte_pfn(pte)))
 +                      goto bad_page;
 +              page = pte_page(pte);
 +      }
  
        if (flags & FOLL_GET)
                get_page(page);
@@@ -1209,46 -1172,65 +1209,46 @@@ no_page
        pte_unmap_unlock(ptep, ptl);
        if (!pte_none(pte))
                return page;
 -      /* Fall through to ZERO_PAGE handling */
 +
  no_page_table:
        /*
         * When core dumping an enormous anonymous area that nobody
 -       * has touched so far, we don't want to allocate page tables.
 +       * has touched so far, we don't want to allocate unnecessary pages or
 +       * page tables.  Return error instead of NULL to skip handle_mm_fault,
 +       * then get_dump_page() will return NULL to leave a hole in the dump.
 +       * But we can only make this optimization where a hole would surely
 +       * be zero-filled if handle_mm_fault() actually did handle it.
         */
 -      if (flags & FOLL_ANON) {
 -              page = ZERO_PAGE(0);
 -              if (flags & FOLL_GET)
 -                      get_page(page);
 -              BUG_ON(flags & FOLL_WRITE);
 -      }
 +      if ((flags & FOLL_DUMP) &&
 +          (!vma->vm_ops || !vma->vm_ops->fault))
 +              return ERR_PTR(-EFAULT);
        return page;
  }
  
 -/* Can we do the FOLL_ANON optimization? */
 -static inline int use_zero_page(struct vm_area_struct *vma)
 -{
 -      /*
 -       * We don't want to optimize FOLL_ANON for make_pages_present()
 -       * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
 -       * we want to get the page from the page tables to make sure
 -       * that we serialize and update with any other user of that
 -       * mapping.
 -       */
 -      if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
 -              return 0;
 -      /*
 -       * And if we have a fault routine, it's not an anonymous region.
 -       */
 -      return !vma->vm_ops || !vma->vm_ops->fault;
 -}
 -
 -
 -
  int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 -                   unsigned long start, int nr_pages, int flags,
 +                   unsigned long start, int nr_pages, unsigned int gup_flags,
                     struct page **pages, struct vm_area_struct **vmas)
  {
        int i;
 -      unsigned int vm_flags = 0;
 -      int write = !!(flags & GUP_FLAGS_WRITE);
 -      int force = !!(flags & GUP_FLAGS_FORCE);
 -      int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 -      int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 +      unsigned long vm_flags;
  
        if (nr_pages <= 0)
                return 0;
 +
 +      VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 +
        /* 
         * Require read or write permissions.
 -       * If 'force' is set, we only require the "MAY" flags.
 +       * If FOLL_FORCE is set, we only require the "MAY" flags.
         */
 -      vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 -      vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 +      vm_flags  = (gup_flags & FOLL_WRITE) ?
 +                      (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 +      vm_flags &= (gup_flags & FOLL_FORCE) ?
 +                      (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
        i = 0;
  
        do {
                struct vm_area_struct *vma;
 -              unsigned int foll_flags;
  
                vma = find_extend_vma(mm, start);
                if (!vma && in_gate_area(tsk, start)) {
                        pte_t *pte;
  
                        /* user gate pages are read-only */
 -                      if (!ignore && write)
 +                      if (gup_flags & FOLL_WRITE)
                                return i ? : -EFAULT;
                        if (pg > TASK_SIZE)
                                pgd = pgd_offset_k(pg);
  
                if (!vma ||
                    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
 -                  (!ignore && !(vm_flags & vma->vm_flags)))
 +                  !(vm_flags & vma->vm_flags))
                        return i ? : -EFAULT;
  
                if (is_vm_hugetlb_page(vma)) {
                        i = follow_hugetlb_page(mm, vma, pages, vmas,
 -                                              &start, &nr_pages, i, write);
 +                                      &start, &nr_pages, i, gup_flags);
                        continue;
                }
  
 -              foll_flags = FOLL_TOUCH;
 -              if (pages)
 -                      foll_flags |= FOLL_GET;
 -              if (!write && use_zero_page(vma))
 -                      foll_flags |= FOLL_ANON;
 -
                do {
                        struct page *page;
 +                      unsigned int foll_flags = gup_flags;
  
                        /*
                         * If we have a pending SIGKILL, don't keep faulting
 -                       * pages and potentially allocating memory, unless
 -                       * current is handling munlock--e.g., on exit. In
 -                       * that case, we are not allocating memory.  Rather,
 -                       * we're only unlocking already resident/mapped pages.
 +                       * pages and potentially allocating memory.
                         */
 -                      if (unlikely(!ignore_sigkill &&
 -                                      fatal_signal_pending(current)))
 +                      if (unlikely(fatal_signal_pending(current)))
                                return i ? i : -ERESTARTSYS;
  
 -                      if (write)
 -                              foll_flags |= FOLL_WRITE;
 -
                        cond_resched();
                        while (!(page = follow_page(vma, start, foll_flags))) {
                                int ret;
                                if (ret & VM_FAULT_ERROR) {
                                        if (ret & VM_FAULT_OOM)
                                                return i ? i : -ENOMEM;
-                                       else if (ret & VM_FAULT_SIGBUS)
+                                       if (ret &
+                                           (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
                                                return i ? i : -EFAULT;
                                        BUG();
                                }
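
Illustrative aside (not part of the patch): a minimal sketch of how a caller might pin a single user page with get_user_pages() and see the -EFAULT that, after the hunk above, also covers VM_FAULT_HWPOISON. The helper name and error handling are invented for illustration.

/*
 * Hypothetical caller, for illustration only: pin one user page and
 * report failure.  After the change above, a poisoned page surfaces to
 * callers as -EFAULT, the same way VM_FAULT_SIGBUS always did.
 */
#include <linux/mm.h>
#include <linux/sched.h>

static int example_pin_one_page(unsigned long addr, struct page **pagep)
{
        int ret;

        down_read(&current->mm->mmap_sem);
        ret = get_user_pages(current, current->mm, addr,
                             1,         /* nr_pages */
                             1,         /* write    */
                             0,         /* force    */
                             pagep, NULL);
        up_read(&current->mm->mmap_sem);

        if (ret == 1)
                return 0;       /* page pinned, release with put_page() */
        return ret < 0 ? ret : -EFAULT;
}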
@@@ -1424,47 -1419,18 +1425,47 @@@ int get_user_pages(struct task_struct *
                unsigned long start, int nr_pages, int write, int force,
                struct page **pages, struct vm_area_struct **vmas)
  {
 -      int flags = 0;
 +      int flags = FOLL_TOUCH;
  
 +      if (pages)
 +              flags |= FOLL_GET;
        if (write)
 -              flags |= GUP_FLAGS_WRITE;
 +              flags |= FOLL_WRITE;
        if (force)
 -              flags |= GUP_FLAGS_FORCE;
 +              flags |= FOLL_FORCE;
  
        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
  }
 -
  EXPORT_SYMBOL(get_user_pages);
  
 +/**
 + * get_dump_page() - pin user page in memory while writing it to core dump
 + * @addr: user address
 + *
 + * Returns struct page pointer of user page pinned for dump,
 + * to be freed afterwards by page_cache_release() or put_page().
 + *
 + * Returns NULL on any kind of failure - a hole must then be inserted into
 + * the corefile, to preserve alignment with its headers; and also returns
 + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 + * allowing a hole to be left in the corefile to save diskspace.
 + *
 + * Called without mmap_sem, but after all other threads have been killed.
 + */
 +#ifdef CONFIG_ELF_CORE
 +struct page *get_dump_page(unsigned long addr)
 +{
 +      struct vm_area_struct *vma;
 +      struct page *page;
 +
 +      if (__get_user_pages(current, current->mm, addr, 1,
 +                      FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
 +              return NULL;
 +      flush_cache_page(vma, addr, page_to_pfn(page));
 +      return page;
 +}
 +#endif /* CONFIG_ELF_CORE */
 +
  pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
  {
@@@ -1642,8 -1608,7 +1643,8 @@@ int vm_insert_mixed(struct vm_area_stru
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
         * refcount the page if pfn_valid is true (hence insert_page rather
 -       * than insert_pfn).
 +       * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
 +       * without pte special, it would then be refcounted as a normal page.
         */
        if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
                struct page *page;
@@@ -2009,7 -1974,7 +2010,7 @@@ static int do_wp_page(struct mm_struct 
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
 -      if (PageAnon(old_page)) {
 +      if (PageAnon(old_page) && !PageKsm(old_page)) {
                if (!trylock_page(old_page)) {
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
@@@ -2110,19 -2075,10 +2111,19 @@@ gotten
  
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
 -      VM_BUG_ON(old_page == ZERO_PAGE(0));
 -      new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 -      if (!new_page)
 -              goto oom;
 +
 +      if (is_zero_pfn(pte_pfn(orig_pte))) {
 +              new_page = alloc_zeroed_user_highpage_movable(vma, address);
 +              if (!new_page)
 +                      goto oom;
 +      } else {
 +              new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 +              if (!new_page)
 +                      goto oom;
 +              cow_user_page(new_page, old_page, address, vma);
 +      }
 +      __SetPageUptodate(new_page);
 +
        /*
         * Don't let another task, with possibly unlocked vma,
         * keep the mlocked page.
                clear_page_mlock(old_page);
                unlock_page(old_page);
        }
 -      cow_user_page(new_page, old_page, address, vma);
 -      __SetPageUptodate(new_page);
  
        if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
                 * seen in the presence of one thread doing SMC and another
                 * thread doing COW.
                 */
 -              ptep_clear_flush_notify(vma, address, page_table);
 +              ptep_clear_flush(vma, address, page_table);
                page_add_new_anon_rmap(new_page, vma, address);
 -              set_pte_at(mm, address, page_table, entry);
 +              /*
 +               * We call the notify macro here because, when using secondary
 +               * mmu page tables (such as kvm shadow page tables), we want the
 +               * new page to be mapped directly into the secondary page table.
 +               */
 +              set_pte_at_notify(mm, address, page_table, entry);
                update_mmu_cache(vma, address, entry);
                if (old_page) {
                        /*
@@@ -2559,8 -2512,15 +2560,15 @@@ static int do_swap_page(struct mm_struc
                goto out;
  
        entry = pte_to_swp_entry(orig_pte);
-       if (is_migration_entry(entry)) {
-               migration_entry_wait(mm, pmd, address);
+       if (unlikely(non_swap_entry(entry))) {
+               if (is_migration_entry(entry)) {
+                       migration_entry_wait(mm, pmd, address);
+               } else if (is_hwpoison_entry(entry)) {
+                       ret = VM_FAULT_HWPOISON;
+               } else {
+                       print_bad_pte(vma, address, orig_pte, NULL);
+                       ret = VM_FAULT_OOM;
+               }
                goto out;
        }
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
+       } else if (PageHWPoison(page)) {
+               ret = VM_FAULT_HWPOISON;
+               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               goto out;
        }
  
        lock_page(page);
@@@ -2672,16 -2636,6 +2684,16 @@@ static int do_anonymous_page(struct mm_
        spinlock_t *ptl;
        pte_t entry;
  
 +      if (!(flags & FAULT_FLAG_WRITE)) {
 +              entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 +                                              vma->vm_page_prot));
 +              ptl = pte_lockptr(mm, pmd);
 +              spin_lock(ptl);
 +              if (!pte_none(*page_table))
 +                      goto unlock;
 +              goto setpte;
 +      }
 +
        /* Allocate our own private page. */
        pte_unmap(page_table);
  
                goto oom_free_page;
  
        entry = mk_pte(page, vma->vm_page_prot);
 -      entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 +      if (vma->vm_flags & VM_WRITE)
 +              entry = pte_mkwrite(pte_mkdirty(entry));
  
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte_none(*page_table))
                goto release;
 +
        inc_mm_counter(mm, anon_rss);
        page_add_new_anon_rmap(page, vma, address);
 +setpte:
        set_pte_at(mm, address, page_table, entry);
  
        /* No need to invalidate - it was non-present before */
@@@ -2760,6 -2711,12 +2772,12 @@@ static int __do_fault(struct mm_struct 
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
  
+       if (unlikely(PageHWPoison(vmf.page))) {
+               if (ret & VM_FAULT_LOCKED)
+                       unlock_page(vmf.page);
+               return VM_FAULT_HWPOISON;
+       }
        /*
         * For consistency in subsequent calls, make the faulted page always
         * locked.
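
Illustrative aside: with do_swap_page() and __do_fault() now able to return VM_FAULT_HWPOISON, an architecture fault handler can map that bit onto a SIGBUS. The sketch below is an assumption-laden outline, not the x86 code from this series; BUS_MCEERR_AR is assumed to be the "action required" si_code this series introduces, and real handlers also fill in si_addr_lsb.

/*
 * Outline only -- not the real arch code.  Shows one plausible way a
 * page-fault handler could translate VM_FAULT_HWPOISON into a SIGBUS.
 */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/signal.h>

static void example_report_fault_error(struct task_struct *tsk,
                                       unsigned long address,
                                       unsigned int fault)
{
        siginfo_t info;

        if (fault & VM_FAULT_OOM)
                return;                 /* defer to the OOM machinery */

        info.si_signo = SIGBUS;
        info.si_errno = 0;
        info.si_addr  = (void __user *)address;
        /* BUS_MCEERR_AR marks an "action required" memory error */
        info.si_code  = (fault & VM_FAULT_HWPOISON) ? BUS_MCEERR_AR : BUS_ADRERR;
        /* real handlers also set si_addr_lsb to the poison granularity */

        force_sig_info(SIGBUS, &info, tsk);
}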
diff --combined mm/migrate.c
index 16052e80aaacbc182c9ea421bb7f84b02fc3a5b4,e3a0cd3859a9a71b682178c08ec346e6f22aec73..1a4bf4813780eb700ee026030bca18fedc2fbae6
@@@ -67,8 -67,6 +67,8 @@@ int putback_lru_pages(struct list_head 
  
        list_for_each_entry_safe(page, page2, l, lru) {
                list_del(&page->lru);
 +              dec_zone_page_state(page, NR_ISOLATED_ANON +
 +                              page_is_file_cache(page));
                putback_lru_page(page);
                count++;
        }
@@@ -149,7 -147,7 +149,7 @@@ out
  static void remove_file_migration_ptes(struct page *old, struct page *new)
  {
        struct vm_area_struct *vma;
 -      struct address_space *mapping = page_mapping(new);
 +      struct address_space *mapping = new->mapping;
        struct prio_tree_iter iter;
        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  
@@@ -272,7 -270,7 +272,7 @@@ static int migrate_page_move_mapping(st
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                        page_index(page));
  
 -      expected_count = 2 + !!page_has_private(page);
 +      expected_count = 2 + page_has_private(page);
        if (page_count(page) != expected_count ||
                        (struct page *)radix_tree_deref_slot(pslot) != page) {
                spin_unlock_irq(&mapping->tree_lock);
         */
        __dec_zone_page_state(page, NR_FILE_PAGES);
        __inc_zone_page_state(newpage, NR_FILE_PAGES);
 -
 +      if (PageSwapBacked(page)) {
 +              __dec_zone_page_state(page, NR_SHMEM);
 +              __inc_zone_page_state(newpage, NR_SHMEM);
 +      }
        spin_unlock_irq(&mapping->tree_lock);
  
        return 0;
@@@ -669,15 -664,13 +669,15 @@@ static int unmap_and_move(new_page_t ge
                         *    needs to be effective.
                         */
                        try_to_free_buffers(page);
 +                      goto rcu_unlock;
                }
 -              goto rcu_unlock;
 +              goto skip_unmap;
        }
  
        /* Establish migration ptes or remove ptes */
-       try_to_unmap(page, 1);
+       try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
  
 +skip_unmap:
        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page);
  
@@@ -700,8 -693,6 +700,8 @@@ unlock
                 * restored.
                 */
                list_del(&page->lru);
 +              dec_zone_page_state(page, NR_ISOLATED_ANON +
 +                              page_is_file_cache(page));
                putback_lru_page(page);
        }
  
@@@ -746,13 -737,6 +746,13 @@@ int migrate_pages(struct list_head *fro
        struct page *page2;
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc;
 +      unsigned long flags;
 +
 +      local_irq_save(flags);
 +      list_for_each_entry(page, from, lru)
 +              __inc_zone_page_state(page, NR_ISOLATED_ANON +
 +                              page_is_file_cache(page));
 +      local_irq_restore(flags);
  
        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;
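
Illustrative aside: migrate_pages() now charges pages on the list to the NR_ISOLATED_* counters and drops them again in putback_lru_pages(). A deliberately naive caller might look like the sketch below; it would sit inside mm/ next to the code above, the allocation callback ignores node placement, and every name is made up.

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/swap.h>

/* naive new_page_t callback: no node awareness, illustration only */
static struct page *example_new_page(struct page *page, unsigned long private,
                                     int **result)
{
        return alloc_page(GFP_HIGHUSER_MOVABLE);
}

static int example_migrate_one(struct page *page)
{
        LIST_HEAD(pagelist);

        if (isolate_lru_page(page))     /* 0 on success */
                return -EBUSY;

        list_add(&page->lru, &pagelist);
        return migrate_pages(&pagelist, example_new_page, 0);
}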
diff --combined mm/page-writeback.c
index be197f71b096cb83d6ac48ab68cef84b2ba3fc61,bba82c414ba81eb5aaa521a6e31fd0fdbe787d11..d99664e8607e761235a13b2662353df131ac5b41
@@@ -155,37 -155,37 +155,37 @@@ static void update_completion_period(vo
  }
  
  int dirty_background_ratio_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        int ret;
  
 -      ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write)
                dirty_background_bytes = 0;
        return ret;
  }
  
  int dirty_background_bytes_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        int ret;
  
 -      ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write)
                dirty_background_ratio = 0;
        return ret;
  }
  
  int dirty_ratio_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        int old_ratio = vm_dirty_ratio;
        int ret;
  
 -      ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
                update_completion_period();
                vm_dirty_bytes = 0;
  
  
  int dirty_bytes_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        unsigned long old_bytes = vm_dirty_bytes;
        int ret;
  
 -      ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
                update_completion_period();
                vm_dirty_ratio = 0;
@@@ -315,7 -315,7 +315,7 @@@ int bdi_set_min_ratio(struct backing_de
  {
        int ret = 0;
  
 -      spin_lock(&bdi_lock);
 +      spin_lock_bh(&bdi_lock);
        if (min_ratio > bdi->max_ratio) {
                ret = -EINVAL;
        } else {
                        ret = -EINVAL;
                }
        }
 -      spin_unlock(&bdi_lock);
 +      spin_unlock_bh(&bdi_lock);
  
        return ret;
  }
@@@ -339,14 -339,14 +339,14 @@@ int bdi_set_max_ratio(struct backing_de
        if (max_ratio > 100)
                return -EINVAL;
  
 -      spin_lock(&bdi_lock);
 +      spin_lock_bh(&bdi_lock);
        if (bdi->min_ratio > max_ratio) {
                ret = -EINVAL;
        } else {
                bdi->max_ratio = max_ratio;
                bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
        }
 -      spin_unlock(&bdi_lock);
 +      spin_unlock_bh(&bdi_lock);
  
        return ret;
  }
@@@ -380,8 -380,7 +380,8 @@@ static unsigned long highmem_dirtyable_
                struct zone *z =
                        &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
  
 -              x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
 +              x += zone_page_state(z, NR_FREE_PAGES) +
 +                   zone_reclaimable_pages(z);
        }
        /*
         * Make sure that the number of highmem pages is never larger
@@@ -405,7 -404,7 +405,7 @@@ unsigned long determine_dirtyable_memor
  {
        unsigned long x;
  
 -      x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
 +      x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
  
        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);
@@@ -486,7 -485,6 +486,7 @@@ static void balance_dirty_pages(struct 
        unsigned long bdi_thresh;
        unsigned long pages_written = 0;
        unsigned long write_chunk = sync_writeback_pages();
 +      unsigned long pause = 1;
  
        struct backing_dev_info *bdi = mapping->backing_dev_info;
  
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
  
 -              schedule_timeout(1);
 +              schedule_timeout_interruptible(pause);
 +
 +              /*
 +               * Increase the delay for each loop, up to our previous
 +               * default of taking a 100ms nap.
 +               */
 +              pause <<= 1;
 +              if (pause > HZ / 10)
 +                      pause = HZ / 10;
        }
  
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
        if ((laptop_mode && pages_written) ||
            (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
                                          + global_page_state(NR_UNSTABLE_NFS))
 -                                        > background_thresh))) {
 -              struct writeback_control wbc = {
 -                      .bdi            = bdi,
 -                      .sync_mode      = WB_SYNC_NONE,
 -                      .nr_to_write    = nr_writeback,
 -              };
 -
 -
 -              bdi_start_writeback(&wbc);
 -      }
 +                                        > background_thresh)))
 +              bdi_start_writeback(bdi, nr_writeback);
  }
  
  void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@@ -686,9 -684,9 +686,9 @@@ static DEFINE_TIMER(laptop_mode_wb_time
   * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
   */
  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
 -      proc_dointvec(table, write, file, buffer, length, ppos);
 +      proc_dointvec(table, write, buffer, length, ppos);
        return 0;
  }
  
@@@ -1022,10 -1020,12 +1022,10 @@@ int do_writepages(struct address_space 
  
        if (wbc->nr_to_write <= 0)
                return 0;
 -      wbc->for_writepages = 1;
        if (mapping->a_ops->writepages)
                ret = mapping->a_ops->writepages(mapping, wbc);
        else
                ret = generic_writepages(mapping, wbc);
 -      wbc->for_writepages = 0;
        return ret;
  }
  
@@@ -1149,6 -1149,13 +1149,13 @@@ int redirty_page_for_writepage(struct w
  EXPORT_SYMBOL(redirty_page_for_writepage);
  
  /*
+  * Dirty a page.
+  *
+  * For pages with a mapping this should be done under the page lock
+  * for the benefit of asynchronous memory error handling, which prefers
+  * a consistent dirty state. This rule can be broken in some special
+  * cases, but it is better not to.
+  *
   * If the mapping doesn't provide a set_page_dirty a_op, then
   * just fall through and assume that it wants buffer_heads.
   */
diff --combined mm/page_alloc.c
index 88248b3c20bb30dc7216221c05f9f8b47b737df2,9faa7ad95ac536ad34f16bf16996261554705fc3..bf720550b44d85adc294f7fd0b8ede38f73a8902
@@@ -48,7 -48,6 +48,7 @@@
  #include <linux/page_cgroup.h>
  #include <linux/debugobjects.h>
  #include <linux/kmemleak.h>
 +#include <trace/events/kmem.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@@ -72,6 -71,7 +72,6 @@@ EXPORT_SYMBOL(node_states)
  
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
 -unsigned long highest_memmap_pfn __read_mostly;
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  
@@@ -123,8 -123,8 +123,8 @@@ static char * const zone_names[MAX_NR_Z
  
  int min_free_kbytes = 1024;
  
 -unsigned long __meminitdata nr_kernel_pages;
 -unsigned long __meminitdata nr_all_pages;
 +static unsigned long __meminitdata nr_kernel_pages;
 +static unsigned long __meminitdata nr_all_pages;
  static unsigned long __meminitdata dma_reserve;
  
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@@ -234,6 -234,12 +234,12 @@@ static void bad_page(struct page *page
        static unsigned long nr_shown;
        static unsigned long nr_unshown;
  
+       /* Don't complain about poisoned pages */
+       if (PageHWPoison(page)) {
+               __ClearPageBuddy(page);
+               return;
+       }
        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
@@@ -510,7 -516,7 +516,7 @@@ static inline int free_pages_check(stru
  }
  
  /*
 - * Frees a list of pages. 
 + * Frees a number of pages from the PCP lists
   * Assumes all pages on list are in same zone, and of same order.
   * count is the number of pages to free.
   *
   * And clear the zone's pages_scanned counter, to hold off the "all pages are
   * pinned" detection logic.
   */
 -static void free_pages_bulk(struct zone *zone, int count,
 -                                      struct list_head *list, int order)
 +static void free_pcppages_bulk(struct zone *zone, int count,
 +                                      struct per_cpu_pages *pcp)
  {
 +      int migratetype = 0;
 +      int batch_free = 0;
 +
        spin_lock(&zone->lock);
        zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
        zone->pages_scanned = 0;
  
 -      __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
 -      while (count--) {
 +      __mod_zone_page_state(zone, NR_FREE_PAGES, count);
 +      while (count) {
                struct page *page;
 +              struct list_head *list;
  
 -              VM_BUG_ON(list_empty(list));
 -              page = list_entry(list->prev, struct page, lru);
 -              /* have to delete it as __free_one_page list manipulates */
 -              list_del(&page->lru);
 -              __free_one_page(page, zone, order, page_private(page));
 +              /*
 +               * Remove pages from lists in a round-robin fashion. A
 +               * batch_free count is maintained that is incremented when an
 +               * empty list is encountered.  This is so more pages are freed
 +               * off fuller lists instead of spinning excessively around empty
 +               * lists
 +               */
 +              do {
 +                      batch_free++;
 +                      if (++migratetype == MIGRATE_PCPTYPES)
 +                              migratetype = 0;
 +                      list = &pcp->lists[migratetype];
 +              } while (list_empty(list));
 +
 +              do {
 +                      page = list_entry(list->prev, struct page, lru);
 +                      /* must delete as __free_one_page list manipulates */
 +                      list_del(&page->lru);
 +                      __free_one_page(page, zone, 0, migratetype);
 +                      trace_mm_page_pcpu_drain(page, 0, migratetype);
 +              } while (--count && --batch_free && !list_empty(list));
        }
        spin_unlock(&zone->lock);
  }
@@@ -577,7 -563,7 +583,7 @@@ static void __free_pages_ok(struct pag
        unsigned long flags;
        int i;
        int bad = 0;
 -      int wasMlocked = TestClearPageMlocked(page);
 +      int wasMlocked = __TestClearPageMlocked(page);
  
        kmemcheck_free_shadow(page, order);
  
@@@ -666,7 -652,7 +672,7 @@@ static inline void expand(struct zone *
  /*
   * This page is about to be returned from the page allocator
   */
- static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+ static inline int check_new_page(struct page *page)
  {
        if (unlikely(page_mapcount(page) |
                (page->mapping != NULL)  |
                bad_page(page);
                return 1;
        }
+       return 0;
+ }
+ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+ {
+       int i;
+       for (i = 0; i < (1 << order); i++) {
+               struct page *p = page + i;
+               if (unlikely(check_new_page(p)))
+                       return 1;
+       }
  
        set_page_private(page, 0);
        set_page_refcounted(page);
@@@ -803,17 -801,6 +821,17 @@@ static int move_freepages_block(struct 
        return move_freepages(zone, start_page, end_page, migratetype);
  }
  
 +static void change_pageblock_range(struct page *pageblock_page,
 +                                      int start_order, int migratetype)
 +{
 +      int nr_pageblocks = 1 << (start_order - pageblock_order);
 +
 +      while (nr_pageblocks--) {
 +              set_pageblock_migratetype(pageblock_page, migratetype);
 +              pageblock_page += pageblock_nr_pages;
 +      }
 +}
 +
  /* Remove an element from the buddy allocator from the fallback list */
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
                        list_del(&page->lru);
                        rmv_page_order(page);
  
 -                      if (current_order == pageblock_order)
 -                              set_pageblock_migratetype(page,
 +                      /* Take ownership for orders >= pageblock_order */
 +                      if (current_order >= pageblock_order)
 +                              change_pageblock_range(page, current_order,
                                                        start_migratetype);
  
                        expand(zone, page, order, current_order, area, migratetype);
 +
 +                      trace_mm_page_alloc_extfrag(page, order, current_order,
 +                              start_migratetype, migratetype);
 +
                        return page;
                }
        }
@@@ -910,7 -892,6 +928,7 @@@ retry_reserve
                }
        }
  
 +      trace_mm_page_alloc_zone_locked(page, order, migratetype);
        return page;
  }
  
@@@ -971,7 -952,7 +989,7 @@@ void drain_zone_pages(struct zone *zone
                to_drain = pcp->batch;
        else
                to_drain = pcp->count;
 -      free_pages_bulk(zone, to_drain, &pcp->list, 0);
 +      free_pcppages_bulk(zone, to_drain, pcp);
        pcp->count -= to_drain;
        local_irq_restore(flags);
  }
@@@ -997,7 -978,7 +1015,7 @@@ static void drain_pages(unsigned int cp
  
                pcp = &pset->pcp;
                local_irq_save(flags);
 -              free_pages_bulk(zone, pcp->count, &pcp->list, 0);
 +              free_pcppages_bulk(zone, pcp->count, pcp);
                pcp->count = 0;
                local_irq_restore(flags);
        }
@@@ -1063,8 -1044,7 +1081,8 @@@ static void free_hot_cold_page(struct p
        struct zone *zone = page_zone(page);
        struct per_cpu_pages *pcp;
        unsigned long flags;
 -      int wasMlocked = TestClearPageMlocked(page);
 +      int migratetype;
 +      int wasMlocked = __TestClearPageMlocked(page);
  
        kmemcheck_free_shadow(page, 0);
  
        kernel_map_pages(page, 1, 0);
  
        pcp = &zone_pcp(zone, get_cpu())->pcp;
 -      set_page_private(page, get_pageblock_migratetype(page));
 +      migratetype = get_pageblock_migratetype(page);
 +      set_page_private(page, migratetype);
        local_irq_save(flags);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
        __count_vm_event(PGFREE);
  
 +      /*
 +       * We only track unmovable, reclaimable and movable on pcp lists.
 +       * Free ISOLATE pages back to the allocator because they are being
 +       * offlined, but treat RESERVE as movable pages so we can get those
 +       * areas back if necessary. Otherwise, we may have to free
 +       * excessively into the page allocator.
 +       */
 +      if (migratetype >= MIGRATE_PCPTYPES) {
 +              if (unlikely(migratetype == MIGRATE_ISOLATE)) {
 +                      free_one_page(zone, page, 0, migratetype);
 +                      goto out;
 +              }
 +              migratetype = MIGRATE_MOVABLE;
 +      }
 +
        if (cold)
 -              list_add_tail(&page->lru, &pcp->list);
 +              list_add_tail(&page->lru, &pcp->lists[migratetype]);
        else
 -              list_add(&page->lru, &pcp->list);
 +              list_add(&page->lru, &pcp->lists[migratetype]);
        pcp->count++;
        if (pcp->count >= pcp->high) {
 -              free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 +              free_pcppages_bulk(zone, pcp->batch, pcp);
                pcp->count -= pcp->batch;
        }
 +
 +out:
        local_irq_restore(flags);
        put_cpu();
  }
  
  void free_hot_page(struct page *page)
  {
 +      trace_mm_page_free_direct(page, 0);
        free_hot_cold_page(page, 0);
  }
        
 -void free_cold_page(struct page *page)
 -{
 -      free_hot_cold_page(page, 1);
 -}
 -
  /*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
@@@ -1171,23 -1137,35 +1189,23 @@@ again
        cpu  = get_cpu();
        if (likely(order == 0)) {
                struct per_cpu_pages *pcp;
 +              struct list_head *list;
  
                pcp = &zone_pcp(zone, cpu)->pcp;
 +              list = &pcp->lists[migratetype];
                local_irq_save(flags);
 -              if (!pcp->count) {
 -                      pcp->count = rmqueue_bulk(zone, 0,
 -                                      pcp->batch, &pcp->list,
 +              if (list_empty(list)) {
 +                      pcp->count += rmqueue_bulk(zone, 0,
 +                                      pcp->batch, list,
                                        migratetype, cold);
 -                      if (unlikely(!pcp->count))
 +                      if (unlikely(list_empty(list)))
                                goto failed;
                }
  
 -              /* Find a page of the appropriate migrate type */
 -              if (cold) {
 -                      list_for_each_entry_reverse(page, &pcp->list, lru)
 -                              if (page_private(page) == migratetype)
 -                                      break;
 -              } else {
 -                      list_for_each_entry(page, &pcp->list, lru)
 -                              if (page_private(page) == migratetype)
 -                                      break;
 -              }
 -
 -              /* Allocate more to the pcp list if necessary */
 -              if (unlikely(&page->lru == &pcp->list)) {
 -                      pcp->count += rmqueue_bulk(zone, 0,
 -                                      pcp->batch, &pcp->list,
 -                                      migratetype, cold);
 -                      page = list_entry(pcp->list.next, struct page, lru);
 -              }
 +              if (cold)
 +                      page = list_entry(list->prev, struct page, lru);
 +              else
 +                      page = list_entry(list->next, struct page, lru);
  
                list_del(&page->lru);
                pcp->count--;
@@@ -1667,6 -1645,10 +1685,6 @@@ __alloc_pages_direct_reclaim(gfp_t gfp_
  
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
 -
 -      /*
 -       * The task's cpuset might have expanded its set of allowable nodes
 -       */
        p->flags |= PF_MEMALLOC;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
@@@ -1801,7 -1783,6 +1819,7 @@@ __alloc_pages_slowpath(gfp_t gfp_mask, 
  
        wake_all_kswapd(order, zonelist, high_zoneidx);
  
 +restart:
        /*
         * OK, we're below the kswapd watermark and have kicked background
         * reclaim. Now things get more complex, so set up alloc_flags according
         */
        alloc_flags = gfp_to_alloc_flags(gfp_mask);
  
 -restart:
        /* This is the last chance, in general, before the goto nopage. */
        page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                        high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@@ -1943,7 -1925,6 +1961,7 @@@ __alloc_pages_nodemask(gfp_t gfp_mask, 
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
  
 +      trace_mm_page_alloc(page, order, gfp_mask, migratetype);
        return page;
  }
  EXPORT_SYMBOL(__alloc_pages_nodemask);
   */
  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
  {
 -      struct page * page;
 +      struct page *page;
 +
 +      /*
 +       * __get_free_pages() returns a 32-bit address, which cannot represent
 +       * a highmem page
 +       */
 +      VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
 +
        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return (unsigned long) page_address(page);
  }
 -
  EXPORT_SYMBOL(__get_free_pages);
  
  unsigned long get_zeroed_page(gfp_t gfp_mask)
  {
 -      struct page * page;
 -
 -      /*
 -       * get_zeroed_page() returns a 32-bit address, which cannot represent
 -       * a highmem page
 -       */
 -      VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
 -
 -      page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
 -      if (page)
 -              return (unsigned long) page_address(page);
 -      return 0;
 +      return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
  }
 -
  EXPORT_SYMBOL(get_zeroed_page);
  
  void __pagevec_free(struct pagevec *pvec)
  {
        int i = pagevec_count(pvec);
  
 -      while (--i >= 0)
 +      while (--i >= 0) {
 +              trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
                free_hot_cold_page(pvec->pages[i], pvec->cold);
 +      }
  }
  
  void __free_pages(struct page *page, unsigned int order)
  {
        if (put_page_testzero(page)) {
 +              trace_mm_page_free_direct(page, order);
                if (order == 0)
                        free_hot_page(page);
                else
@@@ -2162,28 -2146,23 +2180,28 @@@ void show_free_areas(void
                }
        }
  
 -      printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
 -              " inactive_file:%lu"
 +      printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
 +              " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu"
 -              " dirty:%lu writeback:%lu unstable:%lu\n"
 -              " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
 +              " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
 +              " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 +              " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
                global_page_state(NR_ACTIVE_ANON),
 -              global_page_state(NR_ACTIVE_FILE),
                global_page_state(NR_INACTIVE_ANON),
 +              global_page_state(NR_ISOLATED_ANON),
 +              global_page_state(NR_ACTIVE_FILE),
                global_page_state(NR_INACTIVE_FILE),
 +              global_page_state(NR_ISOLATED_FILE),
                global_page_state(NR_UNEVICTABLE),
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
 +              nr_blockdev_pages(),
                global_page_state(NR_FREE_PAGES),
 -              global_page_state(NR_SLAB_RECLAIMABLE) +
 -                      global_page_state(NR_SLAB_UNRECLAIMABLE),
 +              global_page_state(NR_SLAB_RECLAIMABLE),
 +              global_page_state(NR_SLAB_UNRECLAIMABLE),
                global_page_state(NR_FILE_MAPPED),
 +              global_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE));
  
                        " active_file:%lukB"
                        " inactive_file:%lukB"
                        " unevictable:%lukB"
 +                      " isolated(anon):%lukB"
 +                      " isolated(file):%lukB"
                        " present:%lukB"
 +                      " mlocked:%lukB"
 +                      " dirty:%lukB"
 +                      " writeback:%lukB"
 +                      " mapped:%lukB"
 +                      " shmem:%lukB"
 +                      " slab_reclaimable:%lukB"
 +                      " slab_unreclaimable:%lukB"
 +                      " kernel_stack:%lukB"
 +                      " pagetables:%lukB"
 +                      " unstable:%lukB"
 +                      " bounce:%lukB"
 +                      " writeback_tmp:%lukB"
                        " pages_scanned:%lu"
                        " all_unreclaimable? %s"
                        "\n",
                        K(zone_page_state(zone, NR_ACTIVE_FILE)),
                        K(zone_page_state(zone, NR_INACTIVE_FILE)),
                        K(zone_page_state(zone, NR_UNEVICTABLE)),
 +                      K(zone_page_state(zone, NR_ISOLATED_ANON)),
 +                      K(zone_page_state(zone, NR_ISOLATED_FILE)),
                        K(zone->present_pages),
 +                      K(zone_page_state(zone, NR_MLOCK)),
 +                      K(zone_page_state(zone, NR_FILE_DIRTY)),
 +                      K(zone_page_state(zone, NR_WRITEBACK)),
 +                      K(zone_page_state(zone, NR_FILE_MAPPED)),
 +                      K(zone_page_state(zone, NR_SHMEM)),
 +                      K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
 +                      K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
 +                      zone_page_state(zone, NR_KERNEL_STACK) *
 +                              THREAD_SIZE / 1024,
 +                      K(zone_page_state(zone, NR_PAGETABLE)),
 +                      K(zone_page_state(zone, NR_UNSTABLE_NFS)),
 +                      K(zone_page_state(zone, NR_BOUNCE)),
 +                      K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                        zone->pages_scanned,
                        (zone_is_all_unreclaimable(zone) ? "yes" : "no")
                        );
@@@ -2373,7 -2323,7 +2391,7 @@@ early_param("numa_zonelist_order", setu
   * sysctl handler for numa_zonelist_order
   */
  int numa_zonelist_order_handler(ctl_table *table, int write,
 -              struct file *file, void __user *buffer, size_t *length,
 +              void __user *buffer, size_t *length,
                loff_t *ppos)
  {
        char saved_string[NUMA_ZONELIST_ORDER_LEN];
        if (write)
                strncpy(saved_string, (char*)table->data,
                        NUMA_ZONELIST_ORDER_LEN);
 -      ret = proc_dostring(table, write, file, buffer, length, ppos);
 +      ret = proc_dostring(table, write, buffer, length, ppos);
        if (ret)
                return ret;
        if (write) {
@@@ -2851,8 -2801,7 +2869,8 @@@ static void setup_zone_migrate_reserve(
  {
        unsigned long start_pfn, pfn, end_pfn;
        struct page *page;
 -      unsigned long reserve, block_migratetype;
 +      unsigned long block_migratetype;
 +      int reserve;
  
        /* Get the start pfn, end pfn and the number of blocks to reserve */
        start_pfn = zone->zone_start_pfn;
        reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                        pageblock_order;
  
 +      /*
 +       * Reserve blocks are generally in place to help high-order atomic
 +       * allocations that are short-lived. A min_free_kbytes value that
 +       * would result in more than 2 reserve blocks for atomic allocations
 +       * is assumed to be in place to help anti-fragmentation for the
 +       * future allocation of hugepages at runtime.
 +       */
 +      reserve = min(2, reserve);
 +
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                if (!pfn_valid(pfn))
                        continue;
@@@ -3039,7 -2979,6 +3057,7 @@@ static int zone_batchsize(struct zone *
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
  {
        struct per_cpu_pages *pcp;
 +      int migratetype;
  
        memset(p, 0, sizeof(*p));
  
        pcp->count = 0;
        pcp->high = 6 * batch;
        pcp->batch = max(1UL, 1 * batch);
 -      INIT_LIST_HEAD(&pcp->list);
 +      for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 +              INIT_LIST_HEAD(&pcp->lists[migratetype]);
  }
  
  /*
@@@ -3226,32 -3164,6 +3244,32 @@@ int zone_wait_table_init(struct zone *z
        return 0;
  }
  
 +static int __zone_pcp_update(void *data)
 +{
 +      struct zone *zone = data;
 +      int cpu;
 +      unsigned long batch = zone_batchsize(zone), flags;
 +
 +      for (cpu = 0; cpu < NR_CPUS; cpu++) {
 +              struct per_cpu_pageset *pset;
 +              struct per_cpu_pages *pcp;
 +
 +              pset = zone_pcp(zone, cpu);
 +              pcp = &pset->pcp;
 +
 +              local_irq_save(flags);
 +              free_pcppages_bulk(zone, pcp->count, pcp);
 +              setup_pageset(pset, batch);
 +              local_irq_restore(flags);
 +      }
 +      return 0;
 +}
 +
 +void zone_pcp_update(struct zone *zone)
 +{
 +      stop_machine(__zone_pcp_update, zone, NULL);
 +}
 +
  static __meminit void zone_pcp_init(struct zone *zone)
  {
        int cpu;
@@@ -3826,7 -3738,7 +3844,7 @@@ static void __paginginit free_area_init
                zone_pcp_init(zone);
                for_each_lru(l) {
                        INIT_LIST_HEAD(&zone->lru[l].list);
 -                      zone->lru[l].nr_saved_scan = 0;
 +                      zone->reclaim_stat.nr_saved_scan[l] = 0;
                }
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
@@@ -4615,7 -4527,7 +4633,7 @@@ void setup_per_zone_wmarks(void
        calculate_totalreserve_pages();
  }
  
 -/**
 +/*
   * The inactive anon list should be small enough that the VM never has to
   * do too much work, but large enough that each inactive page has a chance
   * to be referenced again before it is swapped out.
@@@ -4706,9 -4618,9 +4724,9 @@@ module_init(init_per_zone_wmark_min
   *    changes.
   */
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
 -      proc_dointvec(table, write, file, buffer, length, ppos);
 +      proc_dointvec(table, write, buffer, length, ppos);
        if (write)
                setup_per_zone_wmarks();
        return 0;
  
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
        struct zone *zone;
        int rc;
  
 -      rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (rc)
                return rc;
  
  }
  
  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
        struct zone *zone;
        int rc;
  
 -      rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (rc)
                return rc;
  
   * if in function of the boot time zone sizes.
   */
  int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
 -      proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      proc_dointvec_minmax(table, write, buffer, length, ppos);
        setup_per_zone_lowmem_reserve();
        return 0;
  }
   */
  
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
        struct zone *zone;
        unsigned int cpu;
        int ret;
  
 -      ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (!write || (ret == -EINVAL))
                return ret;
        for_each_populated_zone(zone) {
@@@ -4838,14 -4750,7 +4856,14 @@@ void *__init alloc_large_system_hash(co
                        numentries <<= (PAGE_SHIFT - scale);
  
                /* Make sure we've got at least a 0-order allocation.. */
 -              if (unlikely((numentries * bucketsize) < PAGE_SIZE))
 +              if (unlikely(flags & HASH_SMALL)) {
 +                      /* Makes no sense without HASH_EARLY */
 +                      WARN_ON(!(flags & HASH_EARLY));
 +                      if (!(numentries >> *_hash_shift)) {
 +                              numentries = 1UL << *_hash_shift;
 +                              BUG_ON(!numentries);
 +                      }
 +              } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
                        numentries = PAGE_SIZE / bucketsize;
        }
        numentries = roundup_pow_of_two(numentries);
@@@ -4987,16 -4892,13 +5005,16 @@@ int set_migratetype_isolate(struct pag
        struct zone *zone;
        unsigned long flags;
        int ret = -EBUSY;
 +      int zone_idx;
  
        zone = page_zone(page);
 +      zone_idx = zone_idx(zone);
        spin_lock_irqsave(&zone->lock, flags);
        /*
         * In future, more migrate types will be able to be isolation target.
         */
 -      if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
 +      if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
 +          zone_idx != ZONE_MOVABLE)
                goto out;
        set_pageblock_migratetype(page, MIGRATE_ISOLATE);
        move_freepages_block(zone, page, MIGRATE_ISOLATE);
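
Illustrative aside: after the consolidation above, get_zeroed_page() is simply __get_free_pages(gfp_mask | __GFP_ZERO, 0), and the VM_BUG_ON now catches __GFP_HIGHMEM in both paths because the returned address must have a kernel mapping. A tiny usage sketch, with an invented function name:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_use_zeroed_page(void)
{
        unsigned long addr;

        addr = get_zeroed_page(GFP_KERNEL);     /* lowmem, pre-zeroed */
        if (!addr)
                return -ENOMEM;

        /* get_zeroed_page(GFP_HIGHUSER) would now trip the VM_BUG_ON */

        free_page(addr);
        return 0;
}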
diff --combined mm/rmap.c
index 720fc03a7bc454de75fa86f542770ab9b9660788,09c3d0b961168b6168a41b47a80e7f2c4774799f..28aafe2b530668b03c766619a83873ee2a91087e
+++ b/mm/rmap.c
   *                 mapping->tree_lock (widely used, in set_page_dirty,
   *                           in arch-dependent flush_dcache_mmap_lock,
   *                           within inode_lock in __sync_single_inode)
+  *
+  * (code doesn't rely on that order so it could be switched around)
+  * ->tasklist_lock
+  *   anon_vma->lock      (memory_failure, collect_procs_anon)
+  *     pte map lock
   */
  
  #include <linux/mm.h>
@@@ -191,7 -196,7 +196,7 @@@ void __init anon_vma_init(void
   * Getting a lock on a stable anon_vma from a page off the LRU is
   * tricky: page_lock_anon_vma rely on RCU to guard against the races.
   */
- static struct anon_vma *page_lock_anon_vma(struct page *page)
+ struct anon_vma *page_lock_anon_vma(struct page *page)
  {
        struct anon_vma *anon_vma;
        unsigned long anon_mapping;
@@@ -211,7 -216,7 +216,7 @@@ out
        return NULL;
  }
  
- static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+ void page_unlock_anon_vma(struct anon_vma *anon_vma)
  {
        spin_unlock(&anon_vma->lock);
        rcu_read_unlock();
@@@ -311,7 -316,7 +316,7 @@@ pte_t *page_check_address(struct page *
   * if the page is not mapped into the page tables of this VMA.  Only
   * valid for normal file or anonymous VMAs.
   */
- static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
  {
        unsigned long address;
        pte_t *pte;
@@@ -710,6 -715,27 +715,6 @@@ void page_add_file_rmap(struct page *pa
        }
  }
  
 -#ifdef CONFIG_DEBUG_VM
 -/**
 - * page_dup_rmap - duplicate pte mapping to a page
 - * @page:     the page to add the mapping to
 - * @vma:      the vm area being duplicated
 - * @address:  the user virtual address mapped
 - *
 - * For copy_page_range only: minimal extract from page_add_file_rmap /
 - * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
 - * quicker.
 - *
 - * The caller needs to hold the pte lock.
 - */
 -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
 -{
 -      if (PageAnon(page))
 -              __page_check_anon_rmap(page, vma, address);
 -      atomic_inc(&page->_mapcount);
 -}
 -#endif
 -
  /**
   * page_remove_rmap - take down pte mapping from a page
   * @page: page to remove mapping from
   */
  void page_remove_rmap(struct page *page)
  {
 -      if (atomic_add_negative(-1, &page->_mapcount)) {
 -              /*
 -               * Now that the last pte has gone, s390 must transfer dirty
 -               * flag from storage key to struct page.  We can usually skip
 -               * this if the page is anon, so about to be freed; but perhaps
 -               * not if it's in swapcache - there might be another pte slot
 -               * containing the swap entry, but page not yet written to swap.
 -               */
 -              if ((!PageAnon(page) || PageSwapCache(page)) &&
 -                  page_test_dirty(page)) {
 -                      page_clear_dirty(page);
 -                      set_page_dirty(page);
 -              }
 -              if (PageAnon(page))
 -                      mem_cgroup_uncharge_page(page);
 -              __dec_zone_page_state(page,
 -                      PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 -              mem_cgroup_update_mapped_file_stat(page, -1);
 -              /*
 -               * It would be tidy to reset the PageAnon mapping here,
 -               * but that might overwrite a racing page_add_anon_rmap
 -               * which increments mapcount after us but sets mapping
 -               * before us: so leave the reset to free_hot_cold_page,
 -               * and remember that it's only reliable while mapped.
 -               * Leaving it set also helps swapoff to reinstate ptes
 -               * faster for those pages still in swapcache.
 -               */
 +      /* page still mapped by someone else? */
 +      if (!atomic_add_negative(-1, &page->_mapcount))
 +              return;
 +
 +      /*
 +       * Now that the last pte has gone, s390 must transfer dirty
 +       * flag from storage key to struct page.  We can usually skip
 +       * this if the page is anon, so about to be freed; but perhaps
 +       * not if it's in swapcache - there might be another pte slot
 +       * containing the swap entry, but page not yet written to swap.
 +       */
 +      if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
 +              page_clear_dirty(page);
 +              set_page_dirty(page);
 +      }
 +      if (PageAnon(page)) {
 +              mem_cgroup_uncharge_page(page);
 +              __dec_zone_page_state(page, NR_ANON_PAGES);
 +      } else {
 +              __dec_zone_page_state(page, NR_FILE_MAPPED);
        }
 +      mem_cgroup_update_mapped_file_stat(page, -1);
 +      /*
 +       * It would be tidy to reset the PageAnon mapping here,
 +       * but that might overwrite a racing page_add_anon_rmap
 +       * which increments mapcount after us but sets mapping
 +       * before us: so leave the reset to free_hot_cold_page,
 +       * and remember that it's only reliable while mapped.
 +       * Leaving it set also helps swapoff to reinstate ptes
 +       * faster for those pages still in swapcache.
 +       */
  }
  
  /*
   * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
   */
  static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-                               int migration)
+                               enum ttu_flags flags)
  {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
-       if (!migration) {
+       if (!(flags & TTU_IGNORE_MLOCK)) {
                if (vma->vm_flags & VM_LOCKED) {
                        ret = SWAP_MLOCK;
                        goto out_unmap;
                }
+       }
+       if (!(flags & TTU_IGNORE_ACCESS)) {
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
                        ret = SWAP_FAIL;
                        goto out_unmap;
        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);
  
-       if (PageAnon(page)) {
+       if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+               if (PageAnon(page))
+                       dec_mm_counter(mm, anon_rss);
+               else
+                       dec_mm_counter(mm, file_rss);
+               set_pte_at(mm, address, pte,
+                               swp_entry_to_pte(make_hwpoison_entry(page)));
+       } else if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };
  
                if (PageSwapCache(page)) {
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
-                       BUG_ON(!migration);
+                       BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
                        entry = make_migration_entry(page, pte_write(pteval));
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
-       } else if (PAGE_MIGRATION && migration) {
+       } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
@@@ -996,12 -1028,13 +1010,13 @@@ static int try_to_mlock_page(struct pag
   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
   * 'LOCKED.
   */
- static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
  {
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        unsigned int mlocked = 0;
        int ret = SWAP_AGAIN;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
  
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
                                continue;  /* must visit all unlocked vmas */
                        ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                break;
                }
  /**
   * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
   * @page: the page to unmap/unlock
-  * @unlock:  request for unlock rather than unmap [unlikely]
-  * @migration:  unmapping for migration - ignored if @unlock
+  * @flags: action and flags
   *
   * Find all the mappings of a page using the mapping pointer and the vma chains
   * contained in the address_space struct it points to.
   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
   * 'LOCKED.
   */
- static int try_to_unmap_file(struct page *page, int unlock, int migration)
+ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
  {
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        unsigned long max_nl_size = 0;
        unsigned int mapcount;
        unsigned int mlocked = 0;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
  
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
                                continue;       /* must visit all vmas */
                        ret = SWAP_MLOCK;
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                goto out;
                }
                        ret = SWAP_MLOCK;       /* leave mlocked == 0 */
                        goto out;               /* no need to look further */
                }
-               if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
+               if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
+                       (vma->vm_flags & VM_LOCKED))
                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
-                       if (!MLOCK_PAGES && !migration &&
+                       if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
                            (vma->vm_flags & VM_LOCKED))
                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
@@@ -1177,7 -1211,7 +1193,7 @@@ out
  /**
   * try_to_unmap - try to remove all page table mappings to a page
   * @page: the page to get unmapped
-  * @migration: migration flag
+  * @flags: action and flags
   *
   * Tries to remove all the page table entries which are mapping this
   * page, used in the pageout path.  Caller must hold the page lock.
   * SWAP_FAIL  - the page is unswappable
   * SWAP_MLOCK - page is mlocked.
   */
- int try_to_unmap(struct page *page, int migration)
+ int try_to_unmap(struct page *page, enum ttu_flags flags)
  {
        int ret;
  
        BUG_ON(!PageLocked(page));
  
        if (PageAnon(page))
-               ret = try_to_unmap_anon(page, 0, migration);
+               ret = try_to_unmap_anon(page, flags);
        else
-               ret = try_to_unmap_file(page, 0, migration);
+               ret = try_to_unmap_file(page, flags);
        if (ret != SWAP_MLOCK && !page_mapped(page))
                ret = SWAP_SUCCESS;
        return ret;
@@@ -1222,8 -1256,8 +1238,8 @@@ int try_to_munlock(struct page *page
        VM_BUG_ON(!PageLocked(page) || PageLRU(page));
  
        if (PageAnon(page))
-               return try_to_unmap_anon(page, 1, 0);
+               return try_to_unmap_anon(page, TTU_MUNLOCK);
        else
-               return try_to_unmap_file(page, 1, 0);
+               return try_to_unmap_file(page, TTU_MUNLOCK);
  }
  
diff --combined mm/shmem.c
index b206a7a32e2a4e00bc7446ae839407f656031643,bec85895a1fe1d75c7e15c1115d128f411295dd0..98631c26c20001931a6e4ca13032716992d6808c
@@@ -49,6 -49,7 +49,6 @@@ static struct vfsmount *shm_mnt
  #include <linux/backing-dev.h>
  #include <linux/shmem_fs.h>
  #include <linux/writeback.h>
 -#include <linux/vfs.h>
  #include <linux/blkdev.h>
  #include <linux/security.h>
  #include <linux/swapops.h>
@@@ -1096,10 -1097,6 +1096,10 @@@ static int shmem_writepage(struct page 
        shmem_swp_unmap(entry);
  unlock:
        spin_unlock(&info->lock);
 +      /*
 +       * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 +       * clear the SWAP_HAS_CACHE flag.
 +       */
        swapcache_free(swap, NULL);
  redirty:
        set_page_dirty(page);
@@@ -1633,8 -1630,8 +1633,8 @@@ shmem_write_end(struct file *file, stru
        if (pos + copied > inode->i_size)
                i_size_write(inode, pos + copied);
  
-       unlock_page(page);
        set_page_dirty(page);
+       unlock_page(page);
        page_cache_release(page);
  
        return copied;
@@@ -1971,13 -1968,13 +1971,13 @@@ static int shmem_symlink(struct inode *
                        iput(inode);
                        return error;
                }
-               unlock_page(page);
                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_symlink_inode_operations;
                kaddr = kmap_atomic(page, KM_USER0);
                memcpy(kaddr, symname, len);
                kunmap_atomic(kaddr, KM_USER0);
                set_page_dirty(page);
+               unlock_page(page);
                page_cache_release(page);
        }
        if (dir->i_mode & S_ISGID)
@@@ -2301,7 -2298,8 +2301,7 @@@ static void shmem_put_super(struct supe
        sb->s_fs_info = NULL;
  }
  
 -static int shmem_fill_super(struct super_block *sb,
 -                          void *data, int silent)
 +int shmem_fill_super(struct super_block *sb, void *data, int silent)
  {
        struct inode *inode;
        struct dentry *root;
        int err = -ENOMEM;
  
        /* Round up to L1_CACHE_BYTES to resist false sharing */
 -      sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
 +      sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
                                L1_CACHE_BYTES), GFP_KERNEL);
        if (!sbinfo)
                return -ENOMEM;
  
 -      sbinfo->max_blocks = 0;
 -      sbinfo->max_inodes = 0;
        sbinfo->mode = S_IRWXUGO | S_ISVTX;
        sbinfo->uid = current_fsuid();
        sbinfo->gid = current_fsgid();
 -      sbinfo->mpol = NULL;
        sb->s_fs_info = sbinfo;
  
  #ifdef CONFIG_TMPFS
@@@ -2420,6 -2421,7 +2420,7 @@@ static const struct address_space_opera
        .write_end      = shmem_write_end,
  #endif
        .migratepage    = migrate_page,
+       .error_remove_page = generic_error_remove_page,
  };
  
  static const struct file_operations shmem_file_operations = {
@@@ -2518,7 -2520,7 +2519,7 @@@ static struct file_system_type tmpfs_fs
        .kill_sb        = kill_litter_super,
  };
  
 -static int __init init_tmpfs(void)
 +int __init init_tmpfs(void)
  {
        int error;
  
@@@ -2575,7 -2577,7 +2576,7 @@@ static struct file_system_type tmpfs_fs
        .kill_sb        = kill_litter_super,
  };
  
 -static int __init init_tmpfs(void)
 +int __init init_tmpfs(void)
  {
        BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
  
@@@ -2590,11 -2592,6 +2591,11 @@@ int shmem_unuse(swp_entry_t entry, stru
        return 0;
  }
  
 +int shmem_lock(struct file *file, int lock, struct user_struct *user)
 +{
 +      return 0;
 +}
 +
  #define shmem_vm_ops                          generic_file_vm_ops
  #define shmem_file_operations                 ramfs_file_operations
  #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
@@@ -2691,3 -2688,5 +2692,3 @@@ int shmem_zero_setup(struct vm_area_str
        vma->vm_ops = &shmem_vm_ops;
        return 0;
  }
 -
 -module_init(init_tmpfs)
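
The tmpfs change this series actually needs is small: shmem_aops gains
.error_remove_page = generic_error_remove_page next to .migratepage, so the
hwpoison handler may truncate a corrupted tmpfs page instead of having to
kill every process mapping it.  The stand-in C sketch below only models that
dispatch decision; the struct, the handler body and main() are simplified
assumptions, not the kernel's memory-failure code.

    /*
     * A filesystem that supplies ->error_remove_page lets the hwpoison
     * path truncate the bad page; one that doesn't leaves only the
     * "kill the mappers" fallback.  Simplified stand-in types.
     */
    #include <stdio.h>

    struct page;                            /* opaque for this sketch */

    struct address_space_operations {
            int (*error_remove_page)(void *mapping, struct page *page);
    };

    /* stand-in for generic_error_remove_page(): 0 == page truncated */
    static int generic_error_remove_page(void *mapping, struct page *page)
    {
            (void)mapping;
            (void)page;
            return 0;
    }

    /* migration-aware filesystems (tmpfs here) opt in as in the diff */
    static const struct address_space_operations shmem_like_aops = {
            .error_remove_page      = generic_error_remove_page,
    };

    static const struct address_space_operations legacy_aops = { 0 };

    static const char *hwpoison_policy(const struct address_space_operations *a)
    {
            return a->error_remove_page ? "truncate the page" : "kill the mappers";
    }

    int main(void)
    {
            printf("tmpfs-like fs: %s\n", hwpoison_policy(&shmem_like_aops));
            printf("legacy fs:     %s\n", hwpoison_policy(&legacy_aops));
            return 0;
    }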
diff --combined mm/swapfile.c
index f1bf19daadc67143b099518c1bc29aa52ae04227,ce5dda6d604b503f2284d727f1600a3f12af97eb..4de7f02f820b03bfcf36b5fc8d6827b5eecd38cb
@@@ -699,7 -699,7 +699,7 @@@ int free_swap_and_cache(swp_entry_t ent
        struct swap_info_struct *p;
        struct page *page = NULL;
  
-       if (is_migration_entry(entry))
+       if (non_swap_entry(entry))
                return 1;
  
        p = swap_info_get(entry);
@@@ -1575,9 -1575,9 +1575,9 @@@ SYSCALL_DEFINE1(swapoff, const char __u
        p->flags &= ~SWP_WRITEOK;
        spin_unlock(&swap_lock);
  
 -      current->flags |= PF_SWAPOFF;
 +      current->flags |= PF_OOM_ORIGIN;
        err = try_to_unuse(type);
 -      current->flags &= ~PF_SWAPOFF;
 +      current->flags &= ~PF_OOM_ORIGIN;
  
        if (err) {
                /* re-insert swap space back into swap_list */
@@@ -2085,7 -2085,7 +2085,7 @@@ static int __swap_duplicate(swp_entry_
        int count;
        bool has_cache;
  
-       if (is_migration_entry(entry))
+       if (non_swap_entry(entry))
                return -EINVAL;
  
        type = swp_type(entry);
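
Both hunks above widen the old is_migration_entry() check to non_swap_entry():
now that hwpoison entries share the swap-entry encoding, any entry whose type
falls outside the range of real swap devices has to be rejected here, not just
migration entries.  The standalone C sketch below shows the idea with a toy
bit layout; the field widths and the reserved type values are illustrative
assumptions, not the layout in swapops.h.

    /*
     * Toy swap-entry encoding: real swap devices use types below
     * MAX_SWAPFILES, while migration and hwpoison entries borrow type
     * values above it, so a single non_swap_entry() test catches both.
     */
    #include <stdio.h>

    #define MAX_SWAPFILES   32                      /* illustrative */
    #define SWP_MIGRATION   (MAX_SWAPFILES + 0)     /* illustrative */
    #define SWP_HWPOISON    (MAX_SWAPFILES + 1)     /* illustrative */

    typedef struct { unsigned long val; } swp_entry_t;

    static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
    {
            swp_entry_t e = { .val = (type << 24) | (offset & 0xffffff) };
            return e;
    }

    static unsigned long swp_type(swp_entry_t e)
    {
            return e.val >> 24;
    }

    /* true for migration and hwpoison entries: they name no swap device */
    static int non_swap_entry(swp_entry_t e)
    {
            return swp_type(e) >= MAX_SWAPFILES;
    }

    int main(void)
    {
            swp_entry_t swap = swp_entry(3, 12345);           /* device 3 */
            swp_entry_t bad  = swp_entry(SWP_HWPOISON, 777);  /* poisoned */

            printf("swap entry:     non_swap=%d\n", non_swap_entry(swap));
            printf("hwpoison entry: non_swap=%d\n", non_swap_entry(bad));
            return 0;
    }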
diff --combined mm/vmscan.c
index f444b7409085fc8e224da0b11e592b4364daccdd,ab3b0ad3ce527460fc08fe8169c16a6f59d479b6..1219ceb8a9b2d992da20bb9a10942e7cef2d98b1
@@@ -148,8 -148,8 +148,8 @@@ static struct zone_reclaim_stat *get_re
        return &zone->reclaim_stat;
  }
  
 -static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
 -                                 enum lru_list lru)
 +static unsigned long zone_nr_lru_pages(struct zone *zone,
 +                              struct scan_control *sc, enum lru_list lru)
  {
        if (!scanning_global_lru(sc))
                return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@@ -286,12 -286,7 +286,12 @@@ static inline int page_mapping_inuse(st
  
  static inline int is_page_cache_freeable(struct page *page)
  {
 -      return page_count(page) - !!page_has_private(page) == 2;
 +      /*
 +       * A freeable page cache page is referenced only by the caller
 +       * that isolated the page, the page cache radix tree and
 +       * optional buffer heads at page->private.
 +       */
 +      return page_count(page) - page_has_private(page) == 2;
  }
  
  static int may_write_to_queue(struct backing_dev_info *bdi)
@@@ -366,6 -361,7 +366,6 @@@ static pageout_t pageout(struct page *p
         * block, for some throttling. This happens by accident, because
         * swap_backing_dev_info is bust: it doesn't reflect the
         * congestion state of the swapdevs.  Easy to fix, if needed.
 -       * See swapfile.c:page_queue_congested().
         */
        if (!is_page_cache_freeable(page))
                return PAGE_KEEP;
@@@ -535,7 -531,7 +535,7 @@@ redo
                 * unevictable page on [in]active list.
                 * We know how to handle that.
                 */
 -              lru = active + page_is_file_cache(page);
 +              lru = active + page_lru_base_type(page);
                lru_cache_add_lru(page, lru);
        } else {
                /*
@@@ -663,7 -659,7 +663,7 @@@ static unsigned long shrink_page_list(s
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, 0)) {
+                       switch (try_to_unmap(page, TTU_UNMAP)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
@@@ -825,7 -821,7 +825,7 @@@ int __isolate_lru_page(struct page *pag
        if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
                return ret;
  
 -      if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
 +      if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
                return ret;
  
        /*
@@@ -939,16 -935,6 +939,16 @@@ static unsigned long isolate_lru_pages(
                        /* Check that we have not crossed a zone boundary. */
                        if (unlikely(page_zone_id(cursor_page) != zone_id))
                                continue;
 +
 +                      /*
 +                       * If we don't have enough swap space, reclaiming an
 +                       * anon page that doesn't already have a swap slot is
 +                       * pointless.
 +                       */
 +                      if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
 +                                      !PageSwapCache(cursor_page))
 +                              continue;
 +
                        if (__isolate_lru_page(cursor_page, mode, file) == 0) {
                                list_move(&cursor_page->lru, dst);
                                mem_cgroup_del_lru(cursor_page);
@@@ -975,7 -961,7 +975,7 @@@ static unsigned long isolate_pages_glob
        if (file)
                lru += LRU_FILE;
        return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
 -                                                              mode, !!file);
 +                                                              mode, file);
  }
  
  /*
@@@ -990,7 -976,7 +990,7 @@@ static unsigned long clear_active_flags
        struct page *page;
  
        list_for_each_entry(page, page_list, lru) {
 -              lru = page_is_file_cache(page);
 +              lru = page_lru_base_type(page);
                if (PageActive(page)) {
                        lru += LRU_ACTIVE;
                        ClearPageActive(page);
@@@ -1047,31 -1033,6 +1047,31 @@@ int isolate_lru_page(struct page *page
        return ret;
  }
  
 +/*
 + * Are there way too many processes in the direct reclaim path already?
 + */
 +static int too_many_isolated(struct zone *zone, int file,
 +              struct scan_control *sc)
 +{
 +      unsigned long inactive, isolated;
 +
 +      if (current_is_kswapd())
 +              return 0;
 +
 +      if (!scanning_global_lru(sc))
 +              return 0;
 +
 +      if (file) {
 +              inactive = zone_page_state(zone, NR_INACTIVE_FILE);
 +              isolated = zone_page_state(zone, NR_ISOLATED_FILE);
 +      } else {
 +              inactive = zone_page_state(zone, NR_INACTIVE_ANON);
 +              isolated = zone_page_state(zone, NR_ISOLATED_ANON);
 +      }
 +
 +      return isolated > inactive;
 +}
 +
  /*
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
@@@ -1087,14 -1048,6 +1087,14 @@@ static unsigned long shrink_inactive_li
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        int lumpy_reclaim = 0;
  
 +      while (unlikely(too_many_isolated(zone, file, sc))) {
 +              congestion_wait(WRITE, HZ/10);
 +
 +              /* We are about to die and free our memory. Return now. */
 +              if (fatal_signal_pending(current))
 +                      return SWAP_CLUSTER_MAX;
 +      }
 +
        /*
         * If we need a large contiguous chunk of memory, or have
         * trouble getting a small set of contiguous pages, we
                unsigned long nr_active;
                unsigned int count[NR_LRU_LISTS] = { 0, };
                int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
 +              unsigned long nr_anon;
 +              unsigned long nr_file;
  
                nr_taken = sc->isolate_pages(sc->swap_cluster_max,
                             &page_list, &nr_scan, sc->order, mode,
                                zone, sc->mem_cgroup, 0, file);
 +
 +              if (scanning_global_lru(sc)) {
 +                      zone->pages_scanned += nr_scan;
 +                      if (current_is_kswapd())
 +                              __count_zone_vm_events(PGSCAN_KSWAPD, zone,
 +                                                     nr_scan);
 +                      else
 +                              __count_zone_vm_events(PGSCAN_DIRECT, zone,
 +                                                     nr_scan);
 +              }
 +
 +              if (nr_taken == 0)
 +                      goto done;
 +
                nr_active = clear_active_flags(&page_list, count);
                __count_vm_events(PGDEACTIVATE, nr_active);
  
                __mod_zone_page_state(zone, NR_INACTIVE_ANON,
                                                -count[LRU_INACTIVE_ANON]);
  
 -              if (scanning_global_lru(sc))
 -                      zone->pages_scanned += nr_scan;
 +              nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
 +              nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
 +              __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
 +              __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
  
                reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
                reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
                }
  
                nr_reclaimed += nr_freed;
 +
                local_irq_disable();
 -              if (current_is_kswapd()) {
 -                      __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
 +              if (current_is_kswapd())
                        __count_vm_events(KSWAPD_STEAL, nr_freed);
 -              } else if (scanning_global_lru(sc))
 -                      __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
 -
                __count_zone_vm_events(PGSTEAL, zone, nr_freed);
  
 -              if (nr_taken == 0)
 -                      goto done;
 -
                spin_lock(&zone->lru_lock);
                /*
                 * Put back any unfreeable pages.
                        SetPageLRU(page);
                        lru = page_lru(page);
                        add_page_to_lru_list(zone, page, lru);
 -                      if (PageActive(page)) {
 -                              int file = !!page_is_file_cache(page);
 +                      if (is_active_lru(lru)) {
 +                              int file = is_file_lru(lru);
                                reclaim_stat->recent_rotated[file]++;
                        }
                        if (!pagevec_add(&pvec, page)) {
                                spin_lock_irq(&zone->lru_lock);
                        }
                }
 +              __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
 +              __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
 +
        } while (nr_scanned < max_scan);
 -      spin_unlock(&zone->lru_lock);
 +
  done:
 -      local_irq_enable();
 +      spin_unlock_irq(&zone->lru_lock);
        pagevec_release(&pvec);
        return nr_reclaimed;
  }
@@@ -1277,10 -1215,15 +1277,10 @@@ static void move_active_pages_to_lru(st
  
        while (!list_empty(list)) {
                page = lru_to_page(list);
 -              prefetchw_prev_lru_page(page, list, flags);
  
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
  
 -              VM_BUG_ON(!PageActive(page));
 -              if (!is_active_lru(lru))
 -                      ClearPageActive(page);  /* we are de-activating */
 -
                list_move(&page->lru, &zone->lru[lru].list);
                mem_cgroup_add_lru_list(page, lru);
                pgmoved++;
  static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                        struct scan_control *sc, int priority, int file)
  {
 -      unsigned long pgmoved;
 +      unsigned long nr_taken;
        unsigned long pgscanned;
        unsigned long vm_flags;
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_inactive);
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 +      unsigned long nr_rotated = 0;
  
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
 -      pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
 +      nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
                                        ISOLATE_ACTIVE, zone,
                                        sc->mem_cgroup, 1, file);
        /*
        if (scanning_global_lru(sc)) {
                zone->pages_scanned += pgscanned;
        }
 -      reclaim_stat->recent_scanned[!!file] += pgmoved;
 +      reclaim_stat->recent_scanned[file] += nr_taken;
  
        __count_zone_vm_events(PGREFILL, zone, pgscanned);
        if (file)
 -              __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
 +              __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
        else
 -              __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 +              __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
 +      __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
        spin_unlock_irq(&zone->lru_lock);
  
 -      pgmoved = 0;  /* count referenced (mapping) mapped pages */
        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
                /* page_referenced clears PageReferenced */
                if (page_mapping_inuse(page) &&
                    page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
 -                      pgmoved++;
 +                      nr_rotated++;
                        /*
                         * Identify referenced, file-backed active pages and
                         * give them one more trip around the active list. So
                        }
                }
  
 +              ClearPageActive(page);  /* we are de-activating */
                list_add(&page->lru, &l_inactive);
        }
  
         * helps balance scan pressure between file and anonymous pages in
         * get_scan_ratio.
         */
 -      reclaim_stat->recent_rotated[!!file] += pgmoved;
 +      reclaim_stat->recent_rotated[file] += nr_rotated;
  
        move_active_pages_to_lru(zone, &l_active,
                                                LRU_ACTIVE + file * LRU_FILE);
        move_active_pages_to_lru(zone, &l_inactive,
                                                LRU_BASE   + file * LRU_FILE);
 -
 +      __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
  }
  
@@@ -1488,10 -1429,10 +1488,10 @@@ static void get_scan_ratio(struct zone 
        unsigned long ap, fp;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
  
 -      anon  = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
 -              zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
 -      file  = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
 -              zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
 +      anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
 +              zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
 +      file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
 +              zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
  
        if (scanning_global_lru(sc)) {
                free  = zone_page_state(zone, NR_FREE_PAGES);
@@@ -1585,7 -1526,6 +1585,7 @@@ static void shrink_zone(int priority, s
        enum lru_list l;
        unsigned long nr_reclaimed = sc->nr_reclaimed;
        unsigned long swap_cluster_max = sc->swap_cluster_max;
 +      struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        int noswap = 0;
  
        /* If we have no swap space, do not bother scanning anon pages. */
                int file = is_file_lru(l);
                unsigned long scan;
  
 -              scan = zone_nr_pages(zone, sc, l);
 +              scan = zone_nr_lru_pages(zone, sc, l);
                if (priority || noswap) {
                        scan >>= priority;
                        scan = (scan * percent[file]) / 100;
                }
 -              if (scanning_global_lru(sc))
 -                      nr[l] = nr_scan_try_batch(scan,
 -                                                &zone->lru[l].nr_saved_scan,
 -                                                swap_cluster_max);
 -              else
 -                      nr[l] = scan;
 +              nr[l] = nr_scan_try_batch(scan,
 +                                        &reclaim_stat->nr_saved_scan[l],
 +                                        swap_cluster_max);
        }
  
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@@ -1742,7 -1685,7 +1742,7 @@@ static unsigned long do_try_to_free_pag
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
  
 -                      lru_pages += zone_lru_pages(zone);
 +                      lru_pages += zone_reclaimable_pages(zone);
                }
        }
  
@@@ -1836,45 -1779,11 +1836,45 @@@ unsigned long try_to_free_pages(struct 
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  
 +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 +                                              gfp_t gfp_mask, bool noswap,
 +                                              unsigned int swappiness,
 +                                              struct zone *zone, int nid)
 +{
 +      struct scan_control sc = {
 +              .may_writepage = !laptop_mode,
 +              .may_unmap = 1,
 +              .may_swap = !noswap,
 +              .swap_cluster_max = SWAP_CLUSTER_MAX,
 +              .swappiness = swappiness,
 +              .order = 0,
 +              .mem_cgroup = mem,
 +              .isolate_pages = mem_cgroup_isolate_pages,
 +      };
 +      nodemask_t nm  = nodemask_of_node(nid);
 +
 +      sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 +                      (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 +      sc.nodemask = &nm;
 +      sc.nr_reclaimed = 0;
 +      sc.nr_scanned = 0;
 +      /*
 +       * NOTE: Although we can get the priority field, using it
 +       * here is not a good idea, since it limits the pages we can scan.
 +       * If we don't reclaim here, shrink_zone() from balance_pgdat()
 +       * will pick up pages from other mem cgroups as well. We therefore
 +       * hard-code the priority to zero.
 +       */
 +      shrink_zone(0, zone, &sc);
 +      return sc.nr_reclaimed;
 +}
 +
  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                           gfp_t gfp_mask,
                                           bool noswap,
                                           unsigned int swappiness)
  {
 +      struct zonelist *zonelist;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .isolate_pages = mem_cgroup_isolate_pages,
                .nodemask = NULL, /* we don't care the placement */
        };
 -      struct zonelist *zonelist;
  
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@@ -1992,7 -1902,7 +1992,7 @@@ loop_again
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
  
 -                      lru_pages += zone_lru_pages(zone);
 +                      lru_pages += zone_reclaimable_pages(zone);
                }
  
                /*
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
                        int nr_slab;
 +                      int nid, zid;
  
                        if (!populated_zone(zone))
                                continue;
                        temp_priority[i] = priority;
                        sc.nr_scanned = 0;
                        note_zone_scanning_priority(zone, priority);
 +
 +                      nid = pgdat->node_id;
 +                      zid = zone_idx(zone);
 +                      /*
 +                       * Call soft limit reclaim before calling shrink_zone.
 +                       * For now we ignore the return value.
 +                       */
 +                      mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
 +                                                      nid, zid);
                        /*
                         * We put equal pressure on every zone, unless one
                         * zone has way too many pages free already.
                        if (zone_is_all_unreclaimable(zone))
                                continue;
                        if (nr_slab == 0 && zone->pages_scanned >=
 -                                              (zone_lru_pages(zone) * 6))
 +                                      (zone_reclaimable_pages(zone) * 6))
                                        zone_set_flag(zone,
                                                      ZONE_ALL_UNRECLAIMABLE);
                        /*
@@@ -2213,39 -2113,12 +2213,39 @@@ void wakeup_kswapd(struct zone *zone, i
        wake_up_interruptible(&pgdat->kswapd_wait);
  }
  
 -unsigned long global_lru_pages(void)
 +/*
 + * The reclaimable count is mostly accurate.  It may overestimate,
 + * because some of the counted pages are harder to reclaim:
 + * - mlocked pages, which will be moved to the unevictable list when
 + *   encountered
 + * - mapped pages, which may require several passes to be reclaimed
 + * - dirty pages, which are not "instantly" reclaimable
 + */
 +unsigned long global_reclaimable_pages(void)
 +{
 +      int nr;
 +
 +      nr = global_page_state(NR_ACTIVE_FILE) +
 +           global_page_state(NR_INACTIVE_FILE);
 +
 +      if (nr_swap_pages > 0)
 +              nr += global_page_state(NR_ACTIVE_ANON) +
 +                    global_page_state(NR_INACTIVE_ANON);
 +
 +      return nr;
 +}
 +
 +unsigned long zone_reclaimable_pages(struct zone *zone)
  {
 -      return global_page_state(NR_ACTIVE_ANON)
 -              + global_page_state(NR_ACTIVE_FILE)
 -              + global_page_state(NR_INACTIVE_ANON)
 -              + global_page_state(NR_INACTIVE_FILE);
 +      int nr;
 +
 +      nr = zone_page_state(zone, NR_ACTIVE_FILE) +
 +           zone_page_state(zone, NR_INACTIVE_FILE);
 +
 +      if (nr_swap_pages > 0)
 +              nr += zone_page_state(zone, NR_ACTIVE_ANON) +
 +                    zone_page_state(zone, NR_INACTIVE_ANON);
 +
 +      return nr;
  }
  
  #ifdef CONFIG_HIBERNATION
@@@ -2260,7 -2133,6 +2260,7 @@@ static void shrink_all_zones(unsigned l
  {
        struct zone *zone;
        unsigned long nr_reclaimed = 0;
 +      struct zone_reclaim_stat *reclaim_stat;
  
        for_each_populated_zone(zone) {
                enum lru_list l;
                                                l == LRU_ACTIVE_FILE))
                                continue;
  
 -                      zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
 -                      if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
 +                      reclaim_stat = get_reclaim_stat(zone, sc);
 +                      reclaim_stat->nr_saved_scan[l] +=
 +                                              (lru_pages >> prio) + 1;
 +                      if (reclaim_stat->nr_saved_scan[l]
 +                                              >= nr_pages || pass > 3) {
                                unsigned long nr_to_scan;
  
 -                              zone->lru[l].nr_saved_scan = 0;
 +                              reclaim_stat->nr_saved_scan[l] = 0;
                                nr_to_scan = min(nr_pages, lru_pages);
                                nr_reclaimed += shrink_list(l, nr_to_scan, zone,
                                                                sc, prio);
@@@ -2321,7 -2190,7 +2321,7 @@@ unsigned long shrink_all_memory(unsigne
  
        current->reclaim_state = &reclaim_state;
  
 -      lru_pages = global_lru_pages();
 +      lru_pages = global_reclaimable_pages();
        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
        /* If slab caches are huge, it's better to hit them first */
        while (nr_slab >= lru_pages) {
  
                        reclaim_state.reclaimed_slab = 0;
                        shrink_slab(sc.nr_scanned, sc.gfp_mask,
 -                                      global_lru_pages());
 +                                  global_reclaimable_pages());
                        sc.nr_reclaimed += reclaim_state.reclaimed_slab;
                        if (sc.nr_reclaimed >= nr_pages)
                                goto out;
        if (!sc.nr_reclaimed) {
                do {
                        reclaim_state.reclaimed_slab = 0;
 -                      shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
 +                      shrink_slab(nr_pages, sc.gfp_mask,
 +                                  global_reclaimable_pages());
                        sc.nr_reclaimed += reclaim_state.reclaimed_slab;
                } while (sc.nr_reclaimed < nr_pages &&
                                reclaim_state.reclaimed_slab > 0);
@@@ -2701,7 -2569,7 +2701,7 @@@ static void check_move_unevictable_page
  retry:
        ClearPageUnevictable(page);
        if (page_evictable(page, NULL)) {
 -              enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
 +              enum lru_list l = page_lru_base_type(page);
  
                __dec_zone_state(zone, NR_UNEVICTABLE);
                list_move(&page->lru, &zone->lru[l].list);
@@@ -2844,10 -2712,10 +2844,10 @@@ static void scan_all_zones_unevictable_
  unsigned long scan_unevictable_pages;
  
  int scan_unevictable_handler(struct ctl_table *table, int write,
 -                         struct file *file, void __user *buffer,
 +                         void __user *buffer,
                           size_t *length, loff_t *ppos)
  {
 -      proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
 +      proc_doulongvec_minmax(table, write, buffer, length, ppos);
  
        if (write && *(unsigned long *)table->data)
                scan_all_zones_unevictable_pages();