Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux...
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Sep 2009 14:53:22 +0000 (07:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Sep 2009 14:53:22 +0000 (07:53 -0700)
* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (21 commits)
  HWPOISON: Enable error_remove_page on btrfs
  HWPOISON: Add simple debugfs interface to inject hwpoison on arbitrary PFNs
  HWPOISON: Add madvise() based injector for hardware poisoned pages v4
  HWPOISON: Enable error_remove_page for NFS
  HWPOISON: Enable .remove_error_page for migration aware file systems
  HWPOISON: The high level memory error handler in the VM v7
  HWPOISON: Add PR_MCE_KILL prctl to control early kill behaviour per process
  HWPOISON: shmem: call set_page_dirty() with locked page
  HWPOISON: Define a new error_remove_page address space op for async truncation
  HWPOISON: Add invalidate_inode_page
  HWPOISON: Refactor truncate to allow direct truncating of page v2
  HWPOISON: check and isolate corrupted free pages v2
  HWPOISON: Handle hardware poisoned pages in try_to_unmap
  HWPOISON: Use bitmask/action code for try_to_unmap behaviour
  HWPOISON: x86: Add VM_FAULT_HWPOISON handling to x86 page fault handler v2
  HWPOISON: Add poison check to page fault handling
  HWPOISON: Add basic support for poisoned pages in fault handler v3
  HWPOISON: Add new SIGBUS error codes for hardware poison signals
  HWPOISON: Add support for poison swap entries v2
  HWPOISON: Export some rmap vma locking to outside world
  ...

30 files changed:
Documentation/sysctl/vm.txt
arch/x86/mm/fault.c
fs/btrfs/inode.c
fs/ext3/inode.c
fs/ext4/inode.c
fs/ocfs2/aops.c
fs/proc/meminfo.c
fs/xfs/linux-2.6/xfs_aops.c
include/asm-generic/mman-common.h
include/linux/fs.h
include/linux/mm.h
include/linux/page-flags.h
include/linux/prctl.h
include/linux/rmap.h
include/linux/sched.h
include/linux/swap.h
kernel/sys.c
kernel/sysctl.c
mm/Kconfig
mm/Makefile
mm/filemap.c
mm/madvise.c
mm/memory.c
mm/migrate.c
mm/page-writeback.c
mm/page_alloc.c
mm/rmap.c
mm/shmem.c
mm/swapfile.c
mm/vmscan.c

diff --combined Documentation/sysctl/vm.txt
index e6fb1ec2744b180d25bd41e6a672e5627dd8e971,faf62740aa2c2125e19f0c96f2032ac557a4fa11..a6e360d2055c561ae347a75e20c015b3f61a8928
@@@ -32,6 -32,8 +32,8 @@@ Currently, these files are in /proc/sys
  - legacy_va_layout
  - lowmem_reserve_ratio
  - max_map_count
+ - memory_failure_early_kill
+ - memory_failure_recovery
  - min_free_kbytes
  - min_slab_ratio
  - min_unmapped_ratio
@@@ -53,7 -55,6 +55,6 @@@
  - vfs_cache_pressure
  - zone_reclaim_mode
  
  ==============================================================
  
  block_dump
@@@ -275,6 -276,44 +276,44 @@@ e.g., up to one or two maps per allocat
  
  The default value is 65536.
  
+ =============================================================
+ memory_failure_early_kill:
+ Control how to kill processes when an uncorrected memory error (typically
+ a 2-bit error in a memory module) that cannot be handled by the kernel
+ is detected in the background by hardware. In some cases (like the page
+ still having a valid copy on disk) the kernel will handle the failure
+ transparently without affecting any applications. But if there is
+ no other up-to-date copy of the data it will kill processes to prevent
+ any data corruption from propagating.
+ 1: Kill all processes that have the corrupted, non-reloadable page mapped
+ as soon as the corruption is detected.  Note this is not supported
+ for a few types of pages, like kernel internally allocated data or
+ the swap cache, but works for the majority of user pages.
+ 0: Only unmap the corrupted page from all processes and kill only those
+ processes that try to access it.
+ The kill is done using a catchable SIGBUS with BUS_MCEERR_AO, so processes can
+ handle this if they want to.
+ This is only active on architectures/platforms with advanced machine
+ check handling and depends on the hardware capabilities.
+ Applications can override this setting individually with the PR_MCE_KILL prctl.
+ ==============================================================
+ memory_failure_recovery:
+ Enable memory failure recovery (when supported by the platform).
+ 1: Attempt recovery.
+ 0: Always panic on a memory failure.
  ==============================================================
  
  min_free_kbytes:
@@@ -585,9 -624,7 +624,9 @@@ caching of directory and inode objects
  At the default value of vfs_cache_pressure=100 the kernel will attempt to
  reclaim dentries and inodes at a "fair" rate with respect to pagecache and
  swapcache reclaim.  Decreasing vfs_cache_pressure causes the kernel to prefer
 -to retain dentry and inode caches.  Increasing vfs_cache_pressure beyond 100
 +to retain dentry and inode caches. When vfs_cache_pressure=0, the kernel will
 +never reclaim dentries and inodes due to memory pressure and this can easily
 +lead to out-of-memory conditions. Increasing vfs_cache_pressure beyond 100
  causes the kernel to prefer to reclaim dentries and inodes.
  
  ==============================================================
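
The memory_failure_early_kill documentation above and the x86 fault-path change below both deliver a catchable SIGBUS: BUS_MCEERR_AO for background ("action optional") kills and BUS_MCEERR_AR for a fault on a poisoned page, with si_addr/si_addr_lsb describing the affected region. A minimal userspace sketch of catching that signal follows; it assumes a libc whose headers already expose the BUS_MCEERR_* codes and the si_addr_lsb siginfo field (older headers may not).

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Sketch only, not kernel code: install a SIGBUS handler for the hwpoison
 * signals described in the vm.txt text above. */
static void hwpoison_handler(int sig, siginfo_t *si, void *ctx)
{
	unsigned long mask, start;

	(void)sig; (void)ctx;
	if (si->si_code != BUS_MCEERR_AO && si->si_code != BUS_MCEERR_AR)
		_exit(EXIT_FAILURE);		/* some other bus error */

	/* si_addr_lsb is the least significant valid bit of si_addr
	 * (PAGE_SHIFT in the x86 fault path below), i.e. it encodes the
	 * size of the poisoned region. */
	mask  = ~((1UL << si->si_addr_lsb) - 1);
	start = (unsigned long)si->si_addr & mask;

	/* Application-specific recovery for the poisoned range would go
	 * here.  fprintf() is not async-signal-safe; it only keeps the
	 * sketch short. */
	fprintf(stderr, "hwpoison SIGBUS: addr %#lx, %lu bytes, code %d\n",
		start, 1UL << si->si_addr_lsb, si->si_code);
	_exit(EXIT_FAILURE);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = hwpoison_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGBUS, &sa, NULL);

	pause();	/* a real program would do its normal work here */
	return 0;
}
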
diff --combined arch/x86/mm/fault.c
index 82728f2c6d5599ccda0c4cb1dee132ce3305f5ca,8ba5624082000eb5a101e3de8637ea1e4fcbabd9..f4cee9028cf0b01e11951662b625f63371f627e6
@@@ -10,7 -10,7 +10,7 @@@
  #include <linux/bootmem.h>            /* max_low_pfn                  */
  #include <linux/kprobes.h>            /* __kprobes, ...               */
  #include <linux/mmiotrace.h>          /* kmmio_handler, ...           */
 -#include <linux/perf_counter.h>               /* perf_swcounter_event         */
 +#include <linux/perf_event.h>         /* perf_sw_event                */
  
  #include <asm/traps.h>                        /* dotraplinkage, ...           */
  #include <asm/pgalloc.h>              /* pgd_*(), ...                 */
@@@ -167,6 -167,7 +167,7 @@@ force_sig_info_fault(int si_signo, int 
        info.si_errno   = 0;
        info.si_code    = si_code;
        info.si_addr    = (void __user *)address;
+       info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
  
        force_sig_info(si_signo, &info, tsk);
  }
@@@ -790,10 -791,12 +791,12 @@@ out_of_memory(struct pt_regs *regs, uns
  }
  
  static void
- do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
+         unsigned int fault)
  {
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
+       int code = BUS_ADRERR;
  
        up_read(&mm->mmap_sem);
  
        tsk->thread.error_code  = error_code;
        tsk->thread.trap_no     = 14;
  
-       force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
+ #ifdef CONFIG_MEMORY_FAILURE
+       if (fault & VM_FAULT_HWPOISON) {
+               printk(KERN_ERR
+       "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
+                       tsk->comm, tsk->pid, address);
+               code = BUS_MCEERR_AR;
+       }
+ #endif
+       force_sig_info_fault(SIGBUS, code, address, tsk);
  }
  
  static noinline void
@@@ -819,8 -830,8 +830,8 @@@ mm_fault_error(struct pt_regs *regs, un
        if (fault & VM_FAULT_OOM) {
                out_of_memory(regs, error_code, address);
        } else {
-               if (fault & VM_FAULT_SIGBUS)
-                       do_sigbus(regs, error_code, address);
+               if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+                       do_sigbus(regs, error_code, address, fault);
                else
                        BUG();
        }
@@@ -1017,7 -1028,7 +1028,7 @@@ do_page_fault(struct pt_regs *regs, uns
        if (unlikely(error_code & PF_RSVD))
                pgtable_bad(regs, error_code, address);
  
 -      perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 +      perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
  
        /*
         * If we're in an interrupt, have no user context or are running
@@@ -1114,11 -1125,11 +1125,11 @@@ good_area
  
        if (fault & VM_FAULT_MAJOR) {
                tsk->maj_flt++;
 -              perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 +              perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
                                     regs, address);
        } else {
                tsk->min_flt++;
 -              perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 +              perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
                                     regs, address);
        }
  
diff --combined fs/btrfs/inode.c
index 9096fd0ca3ca447a7195f489f03d862340173494,dd86050190fcb28e08d59ffa2b812db14c5764be..d154a3f365d53bf159f6e22aaa1714bccb4e7c56
@@@ -55,13 -55,13 +55,13 @@@ struct btrfs_iget_args 
        struct btrfs_root *root;
  };
  
 -static struct inode_operations btrfs_dir_inode_operations;
 -static struct inode_operations btrfs_symlink_inode_operations;
 -static struct inode_operations btrfs_dir_ro_inode_operations;
 -static struct inode_operations btrfs_special_inode_operations;
 -static struct inode_operations btrfs_file_inode_operations;
 -static struct address_space_operations btrfs_aops;
 -static struct address_space_operations btrfs_symlink_aops;
 +static const struct inode_operations btrfs_dir_inode_operations;
 +static const struct inode_operations btrfs_symlink_inode_operations;
 +static const struct inode_operations btrfs_dir_ro_inode_operations;
 +static const struct inode_operations btrfs_special_inode_operations;
 +static const struct inode_operations btrfs_file_inode_operations;
 +static const struct address_space_operations btrfs_aops;
 +static const struct address_space_operations btrfs_symlink_aops;
  static struct file_operations btrfs_dir_file_operations;
  static struct extent_io_ops btrfs_extent_io_ops;
  
@@@ -5201,7 -5201,7 +5201,7 @@@ static int btrfs_permission(struct inod
        return generic_permission(inode, mask, btrfs_check_acl);
  }
  
 -static struct inode_operations btrfs_dir_inode_operations = {
 +static const struct inode_operations btrfs_dir_inode_operations = {
        .getattr        = btrfs_getattr,
        .lookup         = btrfs_lookup,
        .create         = btrfs_create,
        .removexattr    = btrfs_removexattr,
        .permission     = btrfs_permission,
  };
 -static struct inode_operations btrfs_dir_ro_inode_operations = {
 +static const struct inode_operations btrfs_dir_ro_inode_operations = {
        .lookup         = btrfs_lookup,
        .permission     = btrfs_permission,
  };
@@@ -5259,7 -5259,7 +5259,7 @@@ static struct extent_io_ops btrfs_exten
   *
   * For now we're avoiding this by dropping bmap.
   */
 -static struct address_space_operations btrfs_aops = {
 +static const struct address_space_operations btrfs_aops = {
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
        .set_page_dirty = btrfs_set_page_dirty,
+       .error_remove_page = generic_error_remove_page,
  };
  
 -static struct address_space_operations btrfs_symlink_aops = {
 +static const struct address_space_operations btrfs_symlink_aops = {
        .readpage       = btrfs_readpage,
        .writepage      = btrfs_writepage,
        .invalidatepage = btrfs_invalidatepage,
        .releasepage    = btrfs_releasepage,
  };
  
 -static struct inode_operations btrfs_file_inode_operations = {
 +static const struct inode_operations btrfs_file_inode_operations = {
        .truncate       = btrfs_truncate,
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .fallocate      = btrfs_fallocate,
        .fiemap         = btrfs_fiemap,
  };
 -static struct inode_operations btrfs_special_inode_operations = {
 +static const struct inode_operations btrfs_special_inode_operations = {
        .getattr        = btrfs_getattr,
        .setattr        = btrfs_setattr,
        .permission     = btrfs_permission,
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
  };
 -static struct inode_operations btrfs_symlink_inode_operations = {
 +static const struct inode_operations btrfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
diff --combined fs/ext3/inode.c
index cd098a7b77fc04b7255fe5586248faa67dbbfb07,953b430f92e37f8bfb5d109ef42c7136cddaac56..acf1b14233275e891fd5e1d55560fed331add18c
@@@ -172,21 -172,10 +172,21 @@@ static int try_to_extend_transaction(ha
   * so before we call here everything must be consistently dirtied against
   * this transaction.
   */
 -static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
 +static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
  {
 +      int ret;
 +
        jbd_debug(2, "restarting handle %p\n", handle);
 -      return ext3_journal_restart(handle, blocks_for_truncate(inode));
 +      /*
 +       * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
 +       * At this moment, get_block can be called only for blocks inside
 +       * i_size since page cache has been already dropped and writes are
 +       * blocked by i_mutex. So we can safely drop the truncate_mutex.
 +       */
 +      mutex_unlock(&EXT3_I(inode)->truncate_mutex);
 +      ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
 +      mutex_lock(&EXT3_I(inode)->truncate_mutex);
 +      return ret;
  }
  
  /*
@@@ -1830,6 -1819,7 +1830,7 @@@ static const struct address_space_opera
        .direct_IO              = ext3_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext3_writeback_aops = {
        .direct_IO              = ext3_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext3_journalled_aops = {
        .invalidatepage         = ext3_invalidatepage,
        .releasepage            = ext3_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  void ext3_set_aops(struct inode *inode)
@@@ -2083,7 -2075,7 +2086,7 @@@ static void ext3_clear_blocks(handle_t 
                        ext3_journal_dirty_metadata(handle, bh);
                }
                ext3_mark_inode_dirty(handle, inode);
 -              ext3_journal_test_restart(handle, inode);
 +              truncate_restart_transaction(handle, inode);
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext3_journal_get_write_access(handle, bh);
@@@ -2293,7 -2285,7 +2296,7 @@@ static void ext3_free_branches(handle_
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext3_mark_inode_dirty(handle, inode);
 -                              ext3_journal_test_restart(handle, inode);
 +                              truncate_restart_transaction(handle, inode);
                        }
  
                        ext3_free_blocks(handle, inode, nr, 1);
@@@ -2903,10 -2895,6 +2906,10 @@@ static int ext3_do_update_inode(handle_
        struct buffer_head *bh = iloc->bh;
        int err = 0, rc, block;
  
 +again:
 +      /* we can't allow multiple procs in here at once, it's a bit racy */
 +      lock_buffer(bh);
 +
        /* For fields not tracked in the in-memory inode,
         * initialise them to zero for new inodes. */
        if (ei->i_state & EXT3_STATE_NEW)
                               /* If this is the first large file
                                * created, add a flag to the superblock.
                                */
 +                              unlock_buffer(bh);
                                err = ext3_journal_get_write_access(handle,
                                                EXT3_SB(sb)->s_sbh);
                                if (err)
                                        goto out_brelse;
 +
                                ext3_update_dynamic_rev(sb);
                                EXT3_SET_RO_COMPAT_FEATURE(sb,
                                        EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
                                handle->h_sync = 1;
                                err = ext3_journal_dirty_metadata(handle,
                                                EXT3_SB(sb)->s_sbh);
 +                              /* get our lock and start over */
 +                              goto again;
                        }
                }
        }
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
  
        BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 +      unlock_buffer(bh);
        rc = ext3_journal_dirty_metadata(handle, bh);
        if (!err)
                err = rc;
diff --combined fs/ext4/inode.c
index 3a798737e305756a493e6ad13f865b302f9174a8,349dd6b4da47492ed2e8e94f613de6e1e86dd676..064746fad5812e693ef6d3ef2578822a3007cadb
@@@ -192,24 -192,11 +192,24 @@@ static int try_to_extend_transaction(ha
   * so before we call here everything must be consistently dirtied against
   * this transaction.
   */
 -static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
 + int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 +                               int nblocks)
  {
 +      int ret;
 +
 +      /*
 +       * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
 +       * moment, get_block can be called only for blocks inside i_size since
 +       * page cache has been already dropped and writes are blocked by
 +       * i_mutex. So we can safely drop the i_data_sem here.
 +       */
        BUG_ON(EXT4_JOURNAL(inode) == NULL);
        jbd_debug(2, "restarting handle %p\n", handle);
 -      return ext4_journal_restart(handle, blocks_for_truncate(inode));
 +      up_write(&EXT4_I(inode)->i_data_sem);
 +      ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
 +      down_write(&EXT4_I(inode)->i_data_sem);
 +
 +      return ret;
  }
  
  /*
@@@ -354,7 -341,9 +354,7 @@@ static int ext4_block_to_path(struct in
        int n = 0;
        int final = 0;
  
 -      if (i_block < 0) {
 -              ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
 -      } else if (i_block < direct_blocks) {
 +      if (i_block < direct_blocks) {
                offsets[n++] = i_block;
                final = direct_blocks;
        } else if ((i_block -= direct_blocks) < indirect_blocks) {
@@@ -562,21 -551,15 +562,21 @@@ static ext4_fsblk_t ext4_find_near(stru
   *
   *    Normally this function find the preferred place for block allocation,
   *    returns it.
 + *    Because this is only used for non-extent files, we limit the block nr
 + *    to 32 bits.
   */
  static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
                                   Indirect *partial)
  {
 +      ext4_fsblk_t goal;
 +
        /*
         * XXX need to get goal block from mballoc's data structures
         */
  
 -      return ext4_find_near(inode, partial);
 +      goal = ext4_find_near(inode, partial);
 +      goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 +      return goal;
  }
  
  /**
@@@ -657,8 -640,6 +657,8 @@@ static int ext4_alloc_blocks(handle_t *
                if (*err)
                        goto failed_out;
  
 +              BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
 +
                target -= count;
                /* allocate blocks for indirect blocks */
                while (index < indirect_blks && count) {
                ar.flags = EXT4_MB_HINT_DATA;
  
        current_block = ext4_mb_new_blocks(handle, &ar, err);
 +      BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
  
        if (*err && (target == blks)) {
                /*
@@@ -782,9 -762,8 +782,9 @@@ static int ext4_alloc_branch(handle_t *
                BUFFER_TRACE(bh, "call get_create_access");
                err = ext4_journal_get_create_access(handle, bh);
                if (err) {
 +                      /* Don't brelse(bh) here; it's done in
 +                       * ext4_journal_forget() below */
                        unlock_buffer(bh);
 -                      brelse(bh);
                        goto failed;
                }
  
@@@ -1130,15 -1109,16 +1130,15 @@@ static void ext4_da_update_reserve_spac
                ext4_discard_preallocations(inode);
  }
  
 -static int check_block_validity(struct inode *inode, sector_t logical,
 -                              sector_t phys, int len)
 +static int check_block_validity(struct inode *inode, const char *msg,
 +                              sector_t logical, sector_t phys, int len)
  {
        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
 -              ext4_error(inode->i_sb, "check_block_validity",
 +              ext4_error(inode->i_sb, msg,
                           "inode #%lu logical block %llu mapped to %llu "
                           "(size %d)", inode->i_ino,
                           (unsigned long long) logical,
                           (unsigned long long) phys, len);
 -              WARN_ON(1);
                return -EIO;
        }
        return 0;
@@@ -1190,8 -1170,8 +1190,8 @@@ int ext4_get_blocks(handle_t *handle, s
        up_read((&EXT4_I(inode)->i_data_sem));
  
        if (retval > 0 && buffer_mapped(bh)) {
 -              int ret = check_block_validity(inode, block,
 -                                             bh->b_blocknr, retval);
 +              int ret = check_block_validity(inode, "file system corruption",
 +                                             block, bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
                         * i_data's format changing.  Force the migrate
                         * to fail by clearing migrate flags
                         */
 -                      EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
 -                                                      ~EXT4_EXT_MIGRATE;
 +                      EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
                }
        }
  
  
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && buffer_mapped(bh)) {
 -              int ret = check_block_validity(inode, block,
 -                                             bh->b_blocknr, retval);
 +              int ret = check_block_validity(inode, "file system "
 +                                             "corruption after allocation",
 +                                             block, bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
@@@ -1883,6 -1863,18 +1883,6 @@@ static void ext4_da_page_release_reserv
   * Delayed allocation stuff
   */
  
 -struct mpage_da_data {
 -      struct inode *inode;
 -      sector_t b_blocknr;             /* start block number of extent */
 -      size_t b_size;                  /* size of extent */
 -      unsigned long b_state;          /* state of the extent */
 -      unsigned long first_page, next_page;    /* extent of pages */
 -      struct writeback_control *wbc;
 -      int io_done;
 -      int pages_written;
 -      int retval;
 -};
 -
  /*
   * mpage_da_submit_io - walks through extent of pages and try to write
   * them with writepage() call back
@@@ -2337,7 -2329,7 +2337,7 @@@ static int __mpage_da_writepage(struct 
                /*
                 * Rest of the page in the page_vec
                 * redirty them and skip them. We will
 -               * try to to write them again after
 +               * try to write them again after
                 * starting a new transaction
                 */
                redirty_page_for_writepage(wbc, page);
@@@ -2745,7 -2737,6 +2745,7 @@@ static int ext4_da_writepages(struct ad
        long pages_skipped;
        int range_cyclic, cycled = 1, io_done = 0;
        int needed_blocks, ret = 0, nr_to_writebump = 0;
 +      loff_t range_start = wbc->range_start;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
  
        trace_ext4_da_writepages(inode, wbc);
@@@ -2859,7 -2850,6 +2859,7 @@@ retry
                        mpd.io_done = 1;
                        ret = MPAGE_DA_EXTENT_TAIL;
                }
 +              trace_ext4_da_write_pages(inode, &mpd);
                wbc->nr_to_write -= mpd.pages_written;
  
                ext4_journal_stop(handle);
@@@ -2915,7 -2905,6 +2915,7 @@@ out_writepages
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
        wbc->nr_to_write -= nr_to_writebump;
 +      wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
        return ret;
  }
@@@ -3128,8 -3117,6 +3128,8 @@@ out
   */
  int ext4_alloc_da_blocks(struct inode *inode)
  {
 +      trace_ext4_alloc_da_blocks(inode);
 +
        if (!EXT4_I(inode)->i_reserved_data_blocks &&
            !EXT4_I(inode)->i_reserved_meta_blocks)
                return 0;
@@@ -3386,6 -3373,7 +3386,7 @@@ static const struct address_space_opera
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext4_writeback_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext4_journalled_aops = {
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  static const struct address_space_operations ext4_da_aops = {
        .direct_IO              = ext4_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
  
  void ext4_set_aops(struct inode *inode)
@@@ -3672,8 -3663,7 +3676,8 @@@ static void ext4_clear_blocks(handle_t 
                        ext4_handle_dirty_metadata(handle, inode, bh);
                }
                ext4_mark_inode_dirty(handle, inode);
 -              ext4_journal_test_restart(handle, inode);
 +              ext4_truncate_restart_trans(handle, inode,
 +                                          blocks_for_truncate(inode));
                if (bh) {
                        BUFFER_TRACE(bh, "retaking write access");
                        ext4_journal_get_write_access(handle, bh);
@@@ -3884,8 -3874,7 +3888,8 @@@ static void ext4_free_branches(handle_
                                return;
                        if (try_to_extend_transaction(handle, inode)) {
                                ext4_mark_inode_dirty(handle, inode);
 -                              ext4_journal_test_restart(handle, inode);
 +                              ext4_truncate_restart_trans(handle, inode,
 +                                          blocks_for_truncate(inode));
                        }
  
                        ext4_free_blocks(handle, inode, nr, 1, 1);
@@@ -3973,7 -3962,8 +3977,7 @@@ void ext4_truncate(struct inode *inode
        if (!ext4_can_truncate(inode))
                return;
  
 -      if (ei->i_disksize && inode->i_size == 0 &&
 -          !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 +      if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
  
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@@ -4547,8 -4537,7 +4551,8 @@@ static int ext4_inode_blocks_set(handle
   */
  static int ext4_do_update_inode(handle_t *handle,
                                struct inode *inode,
 -                              struct ext4_iloc *iloc)
 +                              struct ext4_iloc *iloc,
 +                              int do_sync)
  {
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct ext4_inode_info *ei = EXT4_I(inode);
        if (ext4_inode_blocks_set(handle, raw_inode, ei))
                goto out_brelse;
        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
 -      /* clear the migrate flag in the raw_inode */
 -      raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
 +      raw_inode->i_flags = cpu_to_le32(ei->i_flags);
        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_HURD))
                raw_inode->i_file_acl_high =
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
        }
  
 -      BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 -      rc = ext4_handle_dirty_metadata(handle, inode, bh);
 -      if (!err)
 -              err = rc;
 +      /*
 +       * If we're not using a journal and we were called from
 +       * ext4_write_inode() to sync the inode (making do_sync true),
 +       * we can just use sync_dirty_buffer() directly to do our dirty
 +       * work.  Testing s_journal here is a bit redundant but it's
 +       * worth it to avoid potential future trouble.
 +       */
 +      if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
 +              BUFFER_TRACE(bh, "call sync_dirty_buffer");
 +              sync_dirty_buffer(bh);
 +      } else {
 +              BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
 +              rc = ext4_handle_dirty_metadata(handle, inode, bh);
 +              if (!err)
 +                      err = rc;
 +      }
        ei->i_state &= ~EXT4_STATE_NEW;
  
  out_brelse:
   */
  int ext4_write_inode(struct inode *inode, int wait)
  {
 +      int err;
 +
        if (current->flags & PF_MEMALLOC)
                return 0;
  
 -      if (ext4_journal_current_handle()) {
 -              jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
 -              dump_stack();
 -              return -EIO;
 -      }
 +      if (EXT4_SB(inode->i_sb)->s_journal) {
 +              if (ext4_journal_current_handle()) {
 +                      jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
 +                      dump_stack();
 +                      return -EIO;
 +              }
  
 -      if (!wait)
 -              return 0;
 +              if (!wait)
 +                      return 0;
 +
 +              err = ext4_force_commit(inode->i_sb);
 +      } else {
 +              struct ext4_iloc iloc;
  
 -      return ext4_force_commit(inode->i_sb);
 +              err = ext4_get_inode_loc(inode, &iloc);
 +              if (err)
 +                      return err;
 +              err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
 +                                         inode, &iloc, wait);
 +      }
 +      return err;
  }
  
  /*
@@@ -5029,7 -4994,7 +5033,7 @@@ int ext4_mark_iloc_dirty(handle_t *hand
        get_bh(iloc->bh);
  
        /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
 -      err = ext4_do_update_inode(handle, inode, iloc);
 +      err = ext4_do_update_inode(handle, inode, iloc, 0);
        put_bh(iloc->bh);
        return err;
  }
@@@ -5320,21 -5285,12 +5324,21 @@@ int ext4_page_mkwrite(struct vm_area_st
        else
                len = PAGE_CACHE_SIZE;
  
 +      lock_page(page);
 +      /*
 +       * return if we have all the buffers mapped. This avoids
 +       * the need to call write_begin/write_end which does a
 +       * journal_start/journal_stop which can block and take a
 +       * long time
 +       */
        if (page_has_buffers(page)) {
 -              /* return if we have all the buffers mapped */
                if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
 -                                     ext4_bh_unmapped))
 +                                      ext4_bh_unmapped)) {
 +                      unlock_page(page);
                        goto out_unlock;
 +              }
        }
 +      unlock_page(page);
        /*
         * OK, we need to fill the hole... Do write_begin write_end
         * to do block allocation/reservation. We are not holding
diff --combined fs/ocfs2/aops.c
index 72e76062a900d2555fac2dae44ef325a5d8aa897,747f15eefd82e95dbad0c1afdbad690f1c750c0c..deb2b132ae5ed42b68fd11f58413f2ffa4779b83
@@@ -44,7 -44,6 +44,7 @@@
  #include "suballoc.h"
  #include "super.h"
  #include "symlink.h"
 +#include "refcounttree.h"
  
  #include "buffer_head_io.h"
  
@@@ -127,8 -126,8 +127,8 @@@ bail
        return err;
  }
  
 -static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 -                         struct buffer_head *bh_result, int create)
 +int ocfs2_get_block(struct inode *inode, sector_t iblock,
 +                  struct buffer_head *bh_result, int create)
  {
        int err = 0;
        unsigned int ext_flags;
@@@ -591,8 -590,6 +591,8 @@@ static int ocfs2_direct_IO_get_blocks(s
                goto bail;
        }
  
 +      /* We should already CoW the refcounted extent. */
 +      BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
        /*
         * get_more_blocks() expects us to describe a hole by clearing
         * the mapped bit on bh_result().
@@@ -690,10 -687,6 +690,10 @@@ static ssize_t ocfs2_direct_IO(int rw
        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
                return 0;
  
 +      /* Fallback to buffered I/O if we are appending. */
 +      if (i_size_read(inode) <= offset)
 +              return 0;
 +
        ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
                                            inode->i_sb->s_bdev, iov, offset,
                                            nr_segs, 
@@@ -1266,8 -1259,7 +1266,8 @@@ static int ocfs2_write_cluster(struct a
                        goto out;
                }
        } else if (unwritten) {
 -              ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
 +              ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 +                                            wc->w_di_bh);
                ret = ocfs2_mark_extent_written(inode, &et,
                                                wc->w_handle, cpos, 1, phys,
                                                meta_ac, &wc->w_dealloc);
@@@ -1456,9 -1448,6 +1456,9 @@@ static int ocfs2_populate_write_desc(st
                                goto out;
                        }
  
 +                      /* We should already CoW the refcounted extent. */
 +                      BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
 +
                        /*
                         * Assume worst case - that we're writing in
                         * the middle of the extent.
@@@ -1539,7 -1528,7 +1539,7 @@@ static int ocfs2_write_begin_inline(str
                goto out;
        }
  
 -      ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
 +      ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                ocfs2_commit_trans(osb, handle);
@@@ -1710,19 -1699,6 +1710,19 @@@ int ocfs2_write_begin_nolock(struct add
                goto out;
        }
  
 +      ret = ocfs2_check_range_for_refcount(inode, pos, len);
 +      if (ret < 0) {
 +              mlog_errno(ret);
 +              goto out;
 +      } else if (ret == 1) {
 +              ret = ocfs2_refcount_cow(inode, di_bh,
 +                                       wc->w_cpos, wc->w_clen, UINT_MAX);
 +              if (ret) {
 +                      mlog_errno(ret);
 +                      goto out;
 +              }
 +      }
 +
        ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
                                        &extents_to_split);
        if (ret) {
                     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
                     clusters_to_alloc, extents_to_split);
  
 -              ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
 +              ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 +                                            wc->w_di_bh);
                ret = ocfs2_lock_allocators(inode, &et,
                                            clusters_to_alloc, extents_to_split,
                                            &data_ac, &meta_ac);
         * We don't want this to fail in ocfs2_write_end(), so do it
         * here.
         */
 -      ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
 +      ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
                                      OCFS2_JOURNAL_ACCESS_WRITE);
        if (ret) {
                mlog_errno(ret);
@@@ -2022,4 -1997,5 +2022,5 @@@ const struct address_space_operations o
        .releasepage            = ocfs2_releasepage,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
diff --combined fs/proc/meminfo.c
index 171e052c07b3684f4a4264ebb44e802affc20f6e,78faedcb0a8d5e9ac012f4fb612609855bf3995a..c7bff4f603ff1557f7663fa22b9fb7ae8ed8a2e8
@@@ -81,11 -81,9 +81,11 @@@ static int meminfo_proc_show(struct seq
                "Writeback:      %8lu kB\n"
                "AnonPages:      %8lu kB\n"
                "Mapped:         %8lu kB\n"
 +              "Shmem:          %8lu kB\n"
                "Slab:           %8lu kB\n"
                "SReclaimable:   %8lu kB\n"
                "SUnreclaim:     %8lu kB\n"
 +              "KernelStack:    %8lu kB\n"
                "PageTables:     %8lu kB\n"
  #ifdef CONFIG_QUICKLIST
                "Quicklists:     %8lu kB\n"
                "Committed_AS:   %8lu kB\n"
                "VmallocTotal:   %8lu kB\n"
                "VmallocUsed:    %8lu kB\n"
-               "VmallocChunk:   %8lu kB\n",
+               "VmallocChunk:   %8lu kB\n"
+ #ifdef CONFIG_MEMORY_FAILURE
+               "HardwareCorrupted: %8lu kB\n"
+ #endif
+               ,
                K(i.totalram),
                K(i.freeram),
                K(i.bufferram),
                K(global_page_state(NR_WRITEBACK)),
                K(global_page_state(NR_ANON_PAGES)),
                K(global_page_state(NR_FILE_MAPPED)),
 +              K(global_page_state(NR_SHMEM)),
                K(global_page_state(NR_SLAB_RECLAIMABLE) +
                                global_page_state(NR_SLAB_UNRECLAIMABLE)),
                K(global_page_state(NR_SLAB_RECLAIMABLE)),
                K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
 +              global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
                K(global_page_state(NR_PAGETABLE)),
  #ifdef CONFIG_QUICKLIST
                K(quicklist_total_size()),
                (unsigned long)VMALLOC_TOTAL >> 10,
                vmi.used >> 10,
                vmi.largest_chunk >> 10
+ #ifdef CONFIG_MEMORY_FAILURE
+               ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10)
+ #endif
                );
  
        hugetlb_report_meminfo(m);
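
The meminfo hunk above adds a "HardwareCorrupted" line backed by the new mce_bad_pages counter. A small sketch of reading it from userspace, assuming only that the field is printed exactly as in the format string above (it is absent on kernels without CONFIG_MEMORY_FAILURE):

#include <stdio.h>

int main(void)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* Matches the "HardwareCorrupted: %8lu kB" format added above. */
		if (sscanf(line, "HardwareCorrupted: %lu", &kb) == 1) {
			printf("hardware corrupted: %lu kB\n", kb);
			fclose(f);
			return 0;
		}
	}
	fclose(f);
	puts("no HardwareCorrupted field (CONFIG_MEMORY_FAILURE not set?)");
	return 0;
}
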
diff --combined fs/xfs/linux-2.6/xfs_aops.c
index d5e5559e31db3774ba6f76e350302009ccf831eb,52f3fc63571aaaa69765f3be5d2f8ed776477c5b..381854461b282fe928c93cf61efb41edb1ad1acf
@@@ -216,6 -216,7 +216,6 @@@ xfs_setfilesize
        if (ip->i_d.di_size < isize) {
                ip->i_d.di_size = isize;
                ip->i_update_core = 1;
 -              ip->i_update_size = 1;
                xfs_mark_inode_dirty_sync(ip);
        }
  
@@@ -1635,4 -1636,5 +1635,5 @@@ const struct address_space_operations x
        .direct_IO              = xfs_vm_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
  };
diff --combined include/asm-generic/mman-common.h
index dd63bd38864b23627f36e57b53c8413af63cd238,c325d1ef42ab746bc9086e3c8347802a8a9c561a..5ee13b2fd223599422446b509227eadb24b7f7e8
  #define MADV_REMOVE   9               /* remove these pages & resources */
  #define MADV_DONTFORK 10              /* don't inherit across fork */
  #define MADV_DOFORK   11              /* do inherit across fork */
+ #define MADV_HWPOISON 100             /* poison a page for testing */
  
 +#define MADV_MERGEABLE   12           /* KSM may merge identical pages */
 +#define MADV_UNMERGEABLE 13           /* KSM may not merge identical pages */
 +
  /* compatibility flags */
  #define MAP_FILE      0
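
MADV_HWPOISON above backs the madvise()-based injector from the commit list; it exists for testing the recovery paths, not for normal use. A hedged sketch of exercising it, assuming a kernel built with the memory-failure/injection code and a sufficiently privileged caller (the injector is root-only), run only on a machine you can afford to lose:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* MADV_HWPOISON is 100 in the hunk above; older userspace headers may not
 * define it yet. */
#ifndef MADV_HWPOISON
#define MADV_HWPOISON 100
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* make sure the page is actually instantiated */

	/* Poison exactly one page; what happens on a later access (SIGBUS,
	 * early kill, ...) depends on the sysctls documented above. */
	if (madvise(p, psz, MADV_HWPOISON) != 0) {
		fprintf(stderr, "madvise(MADV_HWPOISON): %s\n", strerror(errno));
		return 1;
	}
	printf("page at %p poisoned for testing\n", (void *)p);
	return 0;
}
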
  
diff --combined include/linux/fs.h
index 33ed6644abd08a20dfc3e5ba1eef6408a6687add,4f47afd37647b330d0eb16bfcafdf237186c32b1..78e95b8b66d4071dfaf24e319c3aa1470ffd646f
@@@ -595,6 -595,7 +595,7 @@@ struct address_space_operations 
        int (*launder_page) (struct page *);
        int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                                        unsigned long);
+       int (*error_remove_page)(struct address_space *, struct page *);
  };
  
  /*
@@@ -655,6 -656,7 +656,6 @@@ struct block_device 
        int                     bd_invalidated;
        struct gendisk *        bd_disk;
        struct list_head        bd_list;
 -      struct backing_dev_info *bd_inode_backing_dev_info;
        /*
         * Private data.  You must have bd_claim'ed the block_device
         * to use this.  NOTE:  bd_claim allows an owner to claim
@@@ -1066,8 -1068,8 +1067,8 @@@ struct file_lock 
        struct fasync_struct *  fl_fasync; /* for lease break notifications */
        unsigned long fl_break_time;    /* for nonblocking lease breaks */
  
 -      struct file_lock_operations *fl_ops;    /* Callbacks for filesystems */
 -      struct lock_manager_operations *fl_lmops;       /* Callbacks for lockmanagers */
 +      const struct file_lock_operations *fl_ops;      /* Callbacks for filesystems */
 +      const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */
        union {
                struct nfs_lock_info    nfs_fl;
                struct nfs4_lock_info   nfs4_fl;
@@@ -1318,8 -1320,8 +1319,8 @@@ struct super_block 
        unsigned long long      s_maxbytes;     /* Max file size */
        struct file_system_type *s_type;
        const struct super_operations   *s_op;
 -      struct dquot_operations *dq_op;
 -      struct quotactl_ops     *s_qcop;
 +      const struct dquot_operations   *dq_op;
 +      const struct quotactl_ops       *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long           s_flags;
        unsigned long           s_magic;
        int                     s_nr_dentry_unused;     /* # of dentry on lru */
  
        struct block_device     *s_bdev;
 +      struct backing_dev_info *s_bdi;
        struct mtd_info         *s_mtd;
        struct list_head        s_instances;
        struct quota_info       s_dquot;        /* Diskquota specific options */
@@@ -2467,7 -2468,7 +2468,7 @@@ ssize_t simple_attr_write(struct file *
                          size_t len, loff_t *ppos);
  
  struct ctl_table;
 -int proc_nr_files(struct ctl_table *table, int write, struct file *filp,
 +int proc_nr_files(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
  
  int __init get_filesystem_list(char *buf);
diff --combined include/linux/mm.h
index 87218ae84e36f227ac0203c5214bf85351fae6f5,1ffca03f34b799bbdce8f7ee82d8f875fb9e60ae..6953a5a53e4495eeee028ee6c3b8f3bef00839b7
@@@ -25,7 -25,6 +25,7 @@@ extern unsigned long max_mapnr
  #endif
  
  extern unsigned long num_physpages;
 +extern unsigned long totalram_pages;
  extern void * high_memory;
  extern int page_cluster;
  
@@@ -104,7 -103,6 +104,7 @@@ extern unsigned int kobjsize(const voi
  #define VM_MIXEDMAP   0x10000000      /* Can contain "struct page" and pure PFN pages */
  #define VM_SAO                0x20000000      /* Strong Access Ordering (powerpc) */
  #define VM_PFN_AT_MMAP        0x40000000      /* PFNMAP vma that is fully mapped at mmap time */
 +#define VM_MERGEABLE  0x80000000      /* KSM may merge identical pages */
  
  #ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
  #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
@@@ -285,14 -283,6 +285,14 @@@ static inline int is_vmalloc_addr(cons
        return 0;
  #endif
  }
 +#ifdef CONFIG_MMU
 +extern int is_vmalloc_or_module_addr(const void *x);
 +#else
 +static inline int is_vmalloc_or_module_addr(const void *x)
 +{
 +      return 0;
 +}
 +#endif
  
  static inline struct page *compound_head(struct page *page)
  {
@@@ -695,11 -685,12 +695,12 @@@ static inline int page_mapped(struct pa
  #define VM_FAULT_SIGBUS       0x0002
  #define VM_FAULT_MAJOR        0x0004
  #define VM_FAULT_WRITE        0x0008  /* Special case for get_user_pages */
+ #define VM_FAULT_HWPOISON 0x0010      /* Hit poisoned page */
  
  #define VM_FAULT_NOPAGE       0x0100  /* ->fault installed the pte, not return page */
  #define VM_FAULT_LOCKED       0x0200  /* ->fault locked the returned page */
  
- #define VM_FAULT_ERROR        (VM_FAULT_OOM | VM_FAULT_SIGBUS)
+ #define VM_FAULT_ERROR        (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
  
  /*
   * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
@@@ -710,8 -701,17 +711,8 @@@ extern void pagefault_out_of_memory(voi
  
  extern void show_free_areas(void);
  
 -#ifdef CONFIG_SHMEM
 -extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
 -#else
 -static inline int shmem_lock(struct file *file, int lock,
 -                          struct user_struct *user)
 -{
 -      return 0;
 -}
 -#endif
 +int shmem_lock(struct file *file, int lock, struct user_struct *user);
  struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
 -
  int shmem_zero_setup(struct vm_area_struct *);
  
  #ifndef CONFIG_MMU
@@@ -794,6 -794,11 +795,11 @@@ static inline void unmap_shared_mapping
  extern int vmtruncate(struct inode * inode, loff_t offset);
  extern int vmtruncate_range(struct inode * inode, loff_t offset, loff_t end);
  
+ int truncate_inode_page(struct address_space *mapping, struct page *page);
+ int generic_error_remove_page(struct address_space *mapping, struct page *page);
+ int invalidate_inode_page(struct page *page);
  #ifdef CONFIG_MMU
  extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
@@@ -816,7 -821,6 +822,7 @@@ int get_user_pages(struct task_struct *
                        struct page **pages, struct vm_area_struct **vmas);
  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                        struct page **pages);
 +struct page *get_dump_page(unsigned long addr);
  
  extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
  extern void do_invalidatepage(struct page *page, unsigned long offset);
@@@ -1060,8 -1064,6 +1066,8 @@@ extern void setup_per_cpu_pageset(void)
  static inline void setup_per_cpu_pageset(void) {}
  #endif
  
 +extern void zone_pcp_update(struct zone *zone);
 +
  /* nommu.c */
  extern atomic_long_t mmap_pages_allocated;
  
@@@ -1230,8 -1232,7 +1236,8 @@@ struct page *follow_page(struct vm_area
  #define FOLL_WRITE    0x01    /* check pte is writable */
  #define FOLL_TOUCH    0x02    /* mark page accessed */
  #define FOLL_GET      0x04    /* do get_page on page */
 -#define FOLL_ANON     0x08    /* give ZERO_PAGE if no pgtable */
 +#define FOLL_DUMP     0x08    /* give error on hole if it would be zero */
 +#define FOLL_FORCE    0x10    /* get_user_pages read/write w/o permission */
  
  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                        void *data);
@@@ -1279,7 -1280,7 +1285,7 @@@ int in_gate_area_no_task(unsigned long 
  #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);})
  #endif        /* __HAVE_ARCH_GATE_AREA */
  
 -int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 +int drop_caches_sysctl_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
  unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                        unsigned long lru_pages);
@@@ -1308,5 -1309,12 +1314,12 @@@ void vmemmap_populate_print_last(void)
  extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
                                 size_t size);
  extern void refund_locked_memory(struct mm_struct *mm, size_t size);
+ extern void memory_failure(unsigned long pfn, int trapno);
+ extern int __memory_failure(unsigned long pfn, int trapno, int ref);
+ extern int sysctl_memory_failure_early_kill;
+ extern int sysctl_memory_failure_recovery;
+ extern atomic_long_t mce_bad_pages;
  #endif /* __KERNEL__ */
  #endif /* _LINUX_MM_H */
diff --combined include/linux/page-flags.h
index 13de789f0a5c1b8b47660dd37ca0c7da73dd26fd,9bc5fd9fdbf6209e55ab87fc267fee12ae6a7b39..6b202b173955541adbe03ebd8137af6749f1c214
@@@ -51,6 -51,9 +51,9 @@@
   * PG_buddy is set to indicate that the page is free and in the buddy system
   * (see mm/page_alloc.c).
   *
+  * PG_hwpoison indicates that a page got corrupted in hardware and contains
+  * data with incorrect ECC bits that triggered a machine check. Accessing is
+  * not safe since it may cause another machine check. Don't touch!
   */
  
  /*
@@@ -101,6 -104,9 +104,9 @@@ enum pageflags 
  #endif
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
        PG_uncached,            /* Page has been mapped as uncached */
+ #endif
+ #ifdef CONFIG_MEMORY_FAILURE
+       PG_hwpoison,            /* hardware poisoned page. Don't touch */
  #endif
        __NR_PAGEFLAGS,
  
@@@ -158,9 -164,6 +164,9 @@@ static inline int TestSetPage##uname(st
  static inline int TestClearPage##uname(struct page *page)             \
                { return test_and_clear_bit(PG_##lname, &page->flags); }
  
 +#define __TESTCLEARFLAG(uname, lname)                                 \
 +static inline int __TestClearPage##uname(struct page *page)           \
 +              { return __test_and_clear_bit(PG_##lname, &page->flags); }
  
  #define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname)             \
        SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
@@@ -187,9 -190,6 +193,9 @@@ static inline void __ClearPage##uname(s
  #define TESTCLEARFLAG_FALSE(uname)                                    \
  static inline int TestClearPage##uname(struct page *page) { return 0; }
  
 +#define __TESTCLEARFLAG_FALSE(uname)                                  \
 +static inline int __TestClearPage##uname(struct page *page) { return 0; }
 +
  struct page;  /* forward declaration */
  
  TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked)
@@@ -256,11 -256,11 +262,11 @@@ PAGEFLAG(Unevictable, unevictable) __CL
  #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
  #define MLOCK_PAGES 1
  PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked)
 -      TESTSCFLAG(Mlocked, mlocked)
 +      TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked)
  #else
  #define MLOCK_PAGES 0
 -PAGEFLAG_FALSE(Mlocked)
 -      SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked)
 +PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked)
 +      TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked)
  #endif
  
  #ifdef CONFIG_ARCH_USES_PG_UNCACHED
@@@ -269,6 -269,15 +275,15 @@@ PAGEFLAG(Uncached, uncached
  PAGEFLAG_FALSE(Uncached)
  #endif
  
+ #ifdef CONFIG_MEMORY_FAILURE
+ PAGEFLAG(HWPoison, hwpoison)
+ TESTSETFLAG(HWPoison, hwpoison)
+ #define __PG_HWPOISON (1UL << PG_hwpoison)
+ #else
+ PAGEFLAG_FALSE(HWPoison)
+ #define __PG_HWPOISON 0
+ #endif
  static inline int PageUptodate(struct page *page)
  {
        int ret = test_bit(PG_uptodate, &(page)->flags);
@@@ -393,7 -402,7 +408,7 @@@ static inline void __ClearPageTail(stru
         1 << PG_private | 1 << PG_private_2 | \
         1 << PG_buddy   | 1 << PG_writeback | 1 << PG_reserved | \
         1 << PG_slab    | 1 << PG_swapcache | 1 << PG_active | \
-        1 << PG_unevictable | __PG_MLOCKED)
+        1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON)
  
  /*
   * Flags checked when a page is prepped for return by the page allocator.
   */
  #define PAGE_FLAGS_CHECK_AT_PREP      ((1 << NR_PAGEFLAGS) - 1)
  
 -#endif /* !__GENERATING_BOUNDS_H */
 -
 +#define PAGE_FLAGS_PRIVATE                            \
 +      (1 << PG_private | 1 << PG_private_2)
  /**
   * page_has_private - Determine if page has private stuff
   * @page: The page to be checked
   * Determine if a page has private stuff, indicating that release routines
   * should be invoked upon it.
   */
 -#define page_has_private(page)                        \
 -      ((page)->flags & ((1 << PG_private) |   \
 -                        (1 << PG_private_2)))
 +static inline int page_has_private(struct page *page)
 +{
 +      return !!(page->flags & PAGE_FLAGS_PRIVATE);
 +}
 +
 +#endif /* !__GENERATING_BOUNDS_H */
  
  #endif        /* PAGE_FLAGS_H */
diff --combined include/linux/prctl.h
index 07bff666e65b695d1062638f8d20f9770d610161,3dc303197e674ec1afce3394ca7d0d6a74b768cf..931150566ade8d720f156665b2a5400bbc499f0d
@@@ -85,7 -85,9 +85,9 @@@
  #define PR_SET_TIMERSLACK 29
  #define PR_GET_TIMERSLACK 30
  
 -#define PR_TASK_PERF_COUNTERS_DISABLE         31
 -#define PR_TASK_PERF_COUNTERS_ENABLE          32
 +#define PR_TASK_PERF_EVENTS_DISABLE           31
 +#define PR_TASK_PERF_EVENTS_ENABLE            32
  
+ #define PR_MCE_KILL   33
  #endif /* _LINUX_PRCTL_H */
diff --combined include/linux/rmap.h
index 477841d29fce238a2888a7c0af0c7d05bc795524,3c1004e50747fd2cb82be52f08d6c8f69a2b017d..cb0ba7032609d5602a709a54f4b012413b981587
@@@ -71,17 -71,33 +71,29 @@@ void page_add_new_anon_rmap(struct pag
  void page_add_file_rmap(struct page *);
  void page_remove_rmap(struct page *);
  
 -#ifdef CONFIG_DEBUG_VM
 -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address);
 -#else
 -static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
 +static inline void page_dup_rmap(struct page *page)
  {
        atomic_inc(&page->_mapcount);
  }
 -#endif
  
  /*
   * Called from mm/vmscan.c to handle paging out
   */
  int page_referenced(struct page *, int is_locked,
                        struct mem_cgroup *cnt, unsigned long *vm_flags);
- int try_to_unmap(struct page *, int ignore_refs);
+ enum ttu_flags {
+       TTU_UNMAP = 0,                  /* unmap mode */
+       TTU_MIGRATION = 1,              /* migration mode */
+       TTU_MUNLOCK = 2,                /* munlock mode */
+       TTU_ACTION_MASK = 0xff,
+       TTU_IGNORE_MLOCK = (1 << 8),    /* ignore mlock */
+       TTU_IGNORE_ACCESS = (1 << 9),   /* don't age */
+       TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
+ };
+ #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)
+ int try_to_unmap(struct page *, enum ttu_flags flags);
  
  /*
   * Called from mm/filemap_xip.c to unmap empty zero page
@@@ -108,6 -124,13 +120,13 @@@ int page_mkclean(struct page *)
   */
  int try_to_munlock(struct page *);
  
+ /*
+  * Called by memory-failure.c to kill processes.
+  */
+ struct anon_vma *page_lock_anon_vma(struct page *page);
+ void page_unlock_anon_vma(struct anon_vma *anon_vma);
+ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
  #else /* !CONFIG_MMU */
  
  #define anon_vma_init()               do {} while (0)
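
try_to_unmap() above now takes a bitmask rather than a plain flag: an action code in the low byte (extracted by TTU_ACTION()) plus modifier bits such as TTU_IGNORE_HWPOISON. The standalone illustration below copies the enum values from the hunk above purely to show the encoding; it is a demonstration, not kernel code.

#include <stdio.h>

/* Values copied from the include/linux/rmap.h hunk above, for illustration. */
enum ttu_flags {
	TTU_UNMAP = 0,			/* unmap mode */
	TTU_MIGRATION = 1,		/* migration mode */
	TTU_MUNLOCK = 2,		/* munlock mode */
	TTU_ACTION_MASK = 0xff,
	TTU_IGNORE_MLOCK = (1 << 8),	/* ignore mlock */
	TTU_IGNORE_ACCESS = (1 << 9),	/* don't age */
	TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */
};
#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK)

int main(void)
{
	/* The kind of combination the memory-failure code might pass:
	 * plain unmap, but don't bail out on the poisoned page. */
	int flags = TTU_UNMAP | TTU_IGNORE_HWPOISON;

	printf("action code      = %d (TTU_UNMAP)\n", TTU_ACTION(flags));
	printf("ignore hwpoison? = %s\n",
	       (flags & TTU_IGNORE_HWPOISON) ? "yes" : "no");
	return 0;
}
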
diff --combined include/linux/sched.h
index 8a16f6d11dcd3efbb8fcf8536def605ae5684da8,29eae73c951d65dc7bc7e6d93c383bc9bd7f2a50..75e6e60bf583bb89a7784d4476a32766d10db420
@@@ -100,7 -100,7 +100,7 @@@ struct robust_list_head
  struct bio;
  struct fs_struct;
  struct bts_context;
 -struct perf_counter_context;
 +struct perf_event_context;
  
  /*
   * List of flags we want to share for kernel threads,
@@@ -140,10 -140,6 +140,10 @@@ extern int nr_processes(void)
  extern unsigned long nr_running(void);
  extern unsigned long nr_uninterruptible(void);
  extern unsigned long nr_iowait(void);
 +extern unsigned long nr_iowait_cpu(void);
 +extern unsigned long this_cpu_load(void);
 +
 +
  extern void calc_global_load(void);
  extern u64 cpu_nr_migrations(int cpu);
  
@@@ -194,7 -190,6 +194,7 @@@ extern unsigned long long time_sync_thr
  /* in tsk->state again */
  #define TASK_DEAD             64
  #define TASK_WAKEKILL         128
 +#define TASK_WAKING           256
  
  /* Convenience macros for the sake of set_task_state */
  #define TASK_KILLABLE         (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@@ -261,7 -256,7 +261,7 @@@ extern asmlinkage void schedule_tail(st
  extern void init_idle(struct task_struct *idle, int cpu);
  extern void init_idle_bootup_task(struct task_struct *idle);
  
 -extern int runqueue_is_locked(void);
 +extern int runqueue_is_locked(int cpu);
  extern void task_rq_unlock_wait(struct task_struct *p);
  
  extern cpumask_var_t nohz_cpu_mask;
@@@ -309,7 -304,7 +309,7 @@@ extern void softlockup_tick(void)
  extern void touch_softlockup_watchdog(void);
  extern void touch_all_softlockup_watchdogs(void);
  extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
 -                                  struct file *filp, void __user *buffer,
 +                                  void __user *buffer,
                                    size_t *lenp, loff_t *ppos);
  extern unsigned int  softlockup_panic;
  extern int softlockup_thresh;
@@@ -331,7 -326,7 +331,7 @@@ extern unsigned long sysctl_hung_task_c
  extern unsigned long sysctl_hung_task_timeout_secs;
  extern unsigned long sysctl_hung_task_warnings;
  extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 -                                       struct file *filp, void __user *buffer,
 +                                       void __user *buffer,
                                         size_t *lenp, loff_t *ppos);
  #endif
  
@@@ -426,15 -421,6 +426,15 @@@ static inline unsigned long get_mm_hiwa
        return max(mm->hiwater_rss, get_mm_rss(mm));
  }
  
 +static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
 +                                       struct mm_struct *mm)
 +{
 +      unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
 +
 +      if (*maxrss < hiwater_rss)
 +              *maxrss = hiwater_rss;
 +}
 +
  static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
  {
        return max(mm->hiwater_vm, mm->total_vm);
@@@ -447,9 -433,7 +447,9 @@@ extern int get_dumpable(struct mm_struc
  /* dumpable bits */
  #define MMF_DUMPABLE      0  /* core dump is permitted */
  #define MMF_DUMP_SECURELY 1  /* core file is readable only by root */
 +
  #define MMF_DUMPABLE_BITS 2
 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
  
  /* coredump filter bits */
  #define MMF_DUMP_ANON_PRIVATE 2
  #define MMF_DUMP_ELF_HEADERS  6
  #define MMF_DUMP_HUGETLB_PRIVATE 7
  #define MMF_DUMP_HUGETLB_SHARED  8
 +
  #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
  #define MMF_DUMP_FILTER_BITS  7
  #define MMF_DUMP_FILTER_MASK \
  #else
  # define MMF_DUMP_MASK_DEFAULT_ELF    0
  #endif
 +                                      /* leave room for more dump flags */
 +#define MMF_VM_MERGEABLE      16      /* KSM may merge identical pages */
 +
 +#define MMF_INIT_MASK         (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
  
  struct sighand_struct {
        atomic_t                count;
@@@ -493,13 -472,6 +493,13 @@@ struct pacct_struct 
        unsigned long           ac_minflt, ac_majflt;
  };
  
 +struct cpu_itimer {
 +      cputime_t expires;
 +      cputime_t incr;
 +      u32 error;
 +      u32 incr_error;
 +};
 +
  /**
   * struct task_cputime - collected CPU time counts
   * @utime:            time spent in user mode, in &cputime_t units
@@@ -594,12 -566,9 +594,12 @@@ struct signal_struct 
        struct pid *leader_pid;
        ktime_t it_real_incr;
  
 -      /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
 -      cputime_t it_prof_expires, it_virt_expires;
 -      cputime_t it_prof_incr, it_virt_incr;
 +      /*
 +       * ITIMER_PROF and ITIMER_VIRTUAL timers for the process; we use
 +       * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing the array, as these
 +       * values are defined to 0 and 1 respectively.
 +       */
 +      struct cpu_itimer it[2];
  
        /*
         * Thread group totals for process CPU timers.
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
 +      unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;
  
        /*
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
  #endif
 +
 +      int oom_adj;    /* OOM kill score adjustment (bit shift) */
  };
  
  /* Context switch must be unlocked if interrupts are to be enabled */
@@@ -734,7 -700,7 +734,7 @@@ struct user_struct 
  #endif
  #endif
  
 -#ifdef CONFIG_PERF_COUNTERS
 +#ifdef CONFIG_PERF_EVENTS
        atomic_long_t locked_vm;
  #endif
  };
@@@ -836,14 -802,14 +836,14 @@@ enum cpu_idle_type 
  #define SD_BALANCE_NEWIDLE    0x0002  /* Balance when about to become idle */
  #define SD_BALANCE_EXEC               0x0004  /* Balance on exec */
  #define SD_BALANCE_FORK               0x0008  /* Balance on fork, clone */
 -#define SD_WAKE_IDLE          0x0010  /* Wake to idle CPU on task wakeup */
 +#define SD_BALANCE_WAKE               0x0010  /* Balance on wakeup */
  #define SD_WAKE_AFFINE                0x0020  /* Wake task to waking CPU */
 -#define SD_WAKE_BALANCE               0x0040  /* Perform balancing at task wakeup */
 +#define SD_PREFER_LOCAL               0x0040  /* Prefer to keep tasks local to this domain */
  #define SD_SHARE_CPUPOWER     0x0080  /* Domain members share cpu power */
  #define SD_POWERSAVINGS_BALANCE       0x0100  /* Balance for power savings */
  #define SD_SHARE_PKG_RESOURCES        0x0200  /* Domain members share cpu pkg resources */
  #define SD_SERIALIZE          0x0400  /* Only a single load balancing instance */
 -#define SD_WAKE_IDLE_FAR      0x0800  /* Gain latency sacrificing cache hit */
 +
  #define SD_PREFER_SIBLING     0x1000  /* Prefer to place tasks in a sibling domain */
  
  enum powersavings_balance_level {
@@@ -1025,9 -991,6 +1025,9 @@@ static inline int test_sd_parent(struc
        return 0;
  }
  
 +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
 +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
 +
  #else /* CONFIG_SMP */
  
  struct sched_domain_attr;
@@@ -1039,7 -1002,6 +1039,7 @@@ partition_sched_domains(int ndoms_new, 
  }
  #endif        /* !CONFIG_SMP */
  
 +
  struct io_context;                    /* See blkdev.h */
  
  
@@@ -1057,12 -1019,6 +1057,12 @@@ struct uts_namespace
  struct rq;
  struct sched_domain;
  
 +/*
 + * wake flags
 + */
 +#define WF_SYNC               0x01            /* waker goes to sleep after wakeup */
 +#define WF_FORK               0x02            /* child wakeup after fork */
 +
  struct sched_class {
        const struct sched_class *next;
  
        void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
        void (*yield_task) (struct rq *rq);
  
 -      void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 +      void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
  
        struct task_struct * (*pick_next_task) (struct rq *rq);
        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
  
  #ifdef CONFIG_SMP
 -      int  (*select_task_rq)(struct task_struct *p, int sync);
 +      int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
  
        unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
                        struct rq *busiest, unsigned long max_load_move,
        void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
                             int oldprio, int running);
  
 +      unsigned int (*get_rr_interval) (struct task_struct *task);
 +
  #ifdef CONFIG_FAIR_GROUP_SCHED
        void (*moved_group) (struct task_struct *p);
  #endif
@@@ -1148,8 -1102,6 +1148,8 @@@ struct sched_entity 
        u64                     start_runtime;
        u64                     avg_wakeup;
  
 +      u64                     avg_running;
 +
  #ifdef CONFIG_SCHEDSTATS
        u64                     wait_start;
        u64                     wait_max;
@@@ -1247,6 -1199,7 +1247,6 @@@ struct task_struct 
         * a short time
         */
        unsigned char fpu_counter;
 -      s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
  #ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int btrace_seq;
  #endif
        struct mm_struct *mm, *active_mm;
  
  /* task state */
 -      struct linux_binfmt *binfmt;
        int exit_state;
        int exit_code, exit_signal;
        int pdeath_signal;  /*  The signal sent when the parent dies  */
        struct list_head pi_state_list;
        struct futex_pi_state *pi_state_cache;
  #endif
 -#ifdef CONFIG_PERF_COUNTERS
 -      struct perf_counter_context *perf_counter_ctxp;
 -      struct mutex perf_counter_mutex;
 -      struct list_head perf_counter_list;
 +#ifdef CONFIG_PERF_EVENTS
 +      struct perf_event_context *perf_event_ctxp;
 +      struct mutex perf_event_mutex;
 +      struct list_head perf_event_list;
  #endif
  #ifdef CONFIG_NUMA
        struct mempolicy *mempolicy;    /* Protected by alloc_lock */
        /* bitmask of trace recursion */
        unsigned long trace_recursion;
  #endif /* CONFIG_TRACING */
 +      unsigned long stack_start;
  };
  
  /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@@ -1734,6 -1687,7 +1734,7 @@@ extern cputime_t task_gtime(struct task
  #define PF_EXITPIDONE 0x00000008      /* pi exit done on shut down */
  #define PF_VCPU               0x00000010      /* I'm a virtual CPU */
  #define PF_FORKNOEXEC 0x00000040      /* forked but didn't exec */
+ #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
  #define PF_SUPERPRIV  0x00000100      /* used super-user privileges */
  #define PF_DUMPCORE   0x00000200      /* dumped core */
  #define PF_SIGNALED   0x00000400      /* killed by a signal */
  #define PF_FROZEN     0x00010000      /* frozen for system suspend */
  #define PF_FSTRANS    0x00020000      /* inside a filesystem transaction */
  #define PF_KSWAPD     0x00040000      /* I am kswapd */
 -#define PF_SWAPOFF    0x00080000      /* I am in swapoff */
 +#define PF_OOM_ORIGIN 0x00080000      /* Allocating much memory to others */
  #define PF_LESS_THROTTLE 0x00100000   /* Throttle me less: I clean memory */
  #define PF_KTHREAD    0x00200000      /* I am a kernel thread */
  #define PF_RANDOMIZE  0x00400000      /* randomize virtual address space */
  #define PF_SPREAD_PAGE        0x01000000      /* Spread page cache over cpuset */
  #define PF_SPREAD_SLAB        0x02000000      /* Spread some slab caches over cpuset */
  #define PF_THREAD_BOUND       0x04000000      /* Thread bound to specific cpu */
+ #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
  #define PF_MEMPOLICY  0x10000000      /* Non-default NUMA mempolicy */
  #define PF_MUTEX_TESTER       0x20000000      /* Thread belongs to the rt mutex tester */
  #define PF_FREEZER_SKIP       0x40000000      /* Freezer should not count it as freezeable */
  
  #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
  #define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
 -#define RCU_READ_UNLOCK_GOT_QS  (1 << 2) /* CPU has responded to RCU core. */
  
  static inline void rcu_copy_process(struct task_struct *p)
  {
@@@ -1816,13 -1772,10 +1818,13 @@@ static inline int set_cpus_allowed_ptr(
        return 0;
  }
  #endif
 +
 +#ifndef CONFIG_CPUMASK_OFFSTACK
  static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
  {
        return set_cpus_allowed_ptr(p, &new_mask);
  }
 +#endif
  
  /*
   * Architectures can set this to 1 if they have specified
@@@ -1905,7 -1858,7 +1907,7 @@@ extern unsigned int sysctl_sched_time_a
  extern unsigned int sysctl_timer_migration;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
 -              struct file *file, void __user *buffer, size_t *length,
 +              void __user *buffer, size_t *length,
                loff_t *ppos);
  #endif
  #ifdef CONFIG_SCHED_DEBUG
@@@ -1923,7 -1876,7 +1925,7 @@@ extern unsigned int sysctl_sched_rt_per
  extern int sysctl_sched_rt_runtime;
  
  int sched_rt_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos);
  
  extern unsigned int sysctl_sched_compat_yield;
@@@ -2058,7 -2011,6 +2060,7 @@@ extern int kill_pgrp(struct pid *pid, i
  extern int kill_pid(struct pid *pid, int sig, int priv);
  extern int kill_proc_info(int, struct siginfo *, pid_t);
  extern int do_notify_parent(struct task_struct *, int);
 +extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
  extern void force_sig(int, struct task_struct *);
  extern void force_sig_specific(int, struct task_struct *);
  extern int send_sig(int, struct task_struct *, int);
@@@ -2336,10 -2288,7 +2338,10 @@@ static inline int signal_pending(struc
        return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
  }
  
 -extern int __fatal_signal_pending(struct task_struct *p);
 +static inline int __fatal_signal_pending(struct task_struct *p)
 +{
 +      return unlikely(sigismember(&p->pending.signal, SIGKILL));
 +}
  
  static inline int fatal_signal_pending(struct task_struct *p)
  {
diff --combined include/linux/swap.h
index 82232dbea3f711217384bc4e2bbf1380f0851de8,f077e454c65918cfa7e3812adf92d14f4610ef1e..4ec90019c1a4c3997c8da80dd3d36f3b31d09b11
@@@ -34,15 -34,37 +34,37 @@@ static inline int current_is_kswapd(voi
   * the type/offset into the pte as 5/27 as well.
   */
  #define MAX_SWAPFILES_SHIFT   5
- #ifndef CONFIG_MIGRATION
- #define MAX_SWAPFILES         (1 << MAX_SWAPFILES_SHIFT)
+ /*
+  * Use some of the swap files numbers for other purposes. This
+  * is a convenient way to hook into the VM to trigger special
+  * actions on faults.
+  */
+ /*
+  * NUMA node memory migration support
+  */
+ #ifdef CONFIG_MIGRATION
+ #define SWP_MIGRATION_NUM 2
+ #define SWP_MIGRATION_READ    (MAX_SWAPFILES + SWP_HWPOISON_NUM)
+ #define SWP_MIGRATION_WRITE   (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
  #else
- /* Use last two entries for page migration swap entries */
- #define MAX_SWAPFILES         ((1 << MAX_SWAPFILES_SHIFT)-2)
- #define SWP_MIGRATION_READ    MAX_SWAPFILES
- #define SWP_MIGRATION_WRITE   (MAX_SWAPFILES + 1)
+ #define SWP_MIGRATION_NUM 0
  #endif
  
+ /*
+  * Handling of hardware poisoned pages with memory corruption.
+  */
+ #ifdef CONFIG_MEMORY_FAILURE
+ #define SWP_HWPOISON_NUM 1
+ #define SWP_HWPOISON          MAX_SWAPFILES
+ #else
+ #define SWP_HWPOISON_NUM 0
+ #endif
+ #define MAX_SWAPFILES \
+       ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
  /*
   * Magic header for a swap area. The first part of the union is
   * what the swap magic looks like for the old (limited to 128MB)
@@@ -217,11 -239,6 +239,11 @@@ extern unsigned long try_to_free_pages(
  extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
                                                  gfp_t gfp_mask, bool noswap,
                                                  unsigned int swappiness);
 +extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 +                                              gfp_t gfp_mask, bool noswap,
 +                                              unsigned int swappiness,
 +                                              struct zone *zone,
 +                                              int nid);
  extern int __isolate_lru_page(struct page *page, int mode, int file);
  extern unsigned long shrink_all_memory(unsigned long nr_pages);
  extern int vm_swappiness;
@@@ -245,7 -262,7 +267,7 @@@ extern int page_evictable(struct page *
  extern void scan_mapping_unevictable_pages(struct address_space *);
  
  extern unsigned long scan_unevictable_pages;
 -extern int scan_unevictable_handler(struct ctl_table *, int, struct file *,
 +extern int scan_unevictable_handler(struct ctl_table *, int,
                                        void __user *, size_t *, loff_t *);
  extern int scan_unevictable_register_node(struct node *node);
  extern void scan_unevictable_unregister_node(struct node *node);
@@@ -424,22 -441,10 +446,22 @@@ static inline swp_entry_t get_swap_page
  }
  
  /* linux/mm/thrash.c */
 -#define put_swap_token(mm)    do { } while (0)
 -#define grab_swap_token(mm)   do { } while (0)
 -#define has_swap_token(mm)    0
 -#define disable_swap_token()  do { } while (0)
 +static inline void put_swap_token(struct mm_struct *mm)
 +{
 +}
 +
 +static inline void grab_swap_token(struct mm_struct *mm)
 +{
 +}
 +
 +static inline int has_swap_token(struct mm_struct *mm)
 +{
 +      return 0;
 +}
 +
 +static inline void disable_swap_token(void)
 +{
 +}
  
  static inline void
  mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
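
With both CONFIG_MIGRATION and CONFIG_MEMORY_FAILURE enabled, the arithmetic
above reserves the top three of the 1 << MAX_SWAPFILES_SHIFT == 32 encodable
swap types.  A compile-time sketch (not part of the merge) that just spells out
the resulting numbers:

    #include <linux/kernel.h>
    #include <linux/swap.h>

    #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MIGRATION)
    static void __maybe_unused swap_type_layout(void)
    {
            BUILD_BUG_ON(MAX_SWAPFILES != 29);              /* 32 - 2 - 1    */
            BUILD_BUG_ON(SWP_HWPOISON != 29);               /* 1 poison slot */
            BUILD_BUG_ON(SWP_MIGRATION_READ != 30);         /* 2 migration   */
            BUILD_BUG_ON(SWP_MIGRATION_WRITE != 31);        /*   slots       */
    }
    #endif
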
diff --combined kernel/sys.c
index ebcb15611728c510ef59312e3a0cd398757e0b8f,41e02eff33986ca3b32834a696b107433a0b5551..255475d163e0cdb62602306a28134d67005e87c0
@@@ -14,7 -14,7 +14,7 @@@
  #include <linux/prctl.h>
  #include <linux/highuid.h>
  #include <linux/fs.h>
 -#include <linux/perf_counter.h>
 +#include <linux/perf_event.h>
  #include <linux/resource.h>
  #include <linux/kernel.h>
  #include <linux/kexec.h>
@@@ -1338,7 -1338,6 +1338,7 @@@ static void k_getrusage(struct task_str
        unsigned long flags;
        cputime_t utime, stime;
        struct task_cputime cputime;
 +      unsigned long maxrss = 0;
  
        memset((char *) r, 0, sizeof *r);
        utime = stime = cputime_zero;
                utime = task_utime(current);
                stime = task_stime(current);
                accumulate_thread_rusage(p, r);
 +              maxrss = p->signal->maxrss;
                goto out;
        }
  
                        r->ru_majflt = p->signal->cmaj_flt;
                        r->ru_inblock = p->signal->cinblock;
                        r->ru_oublock = p->signal->coublock;
 +                      maxrss = p->signal->cmaxrss;
  
                        if (who == RUSAGE_CHILDREN)
                                break;
                        r->ru_majflt += p->signal->maj_flt;
                        r->ru_inblock += p->signal->inblock;
                        r->ru_oublock += p->signal->oublock;
 +                      if (maxrss < p->signal->maxrss)
 +                              maxrss = p->signal->maxrss;
                        t = p;
                        do {
                                accumulate_thread_rusage(t, r);
  out:
        cputime_to_timeval(utime, &r->ru_utime);
        cputime_to_timeval(stime, &r->ru_stime);
 +
 +      if (who != RUSAGE_CHILDREN) {
 +              struct mm_struct *mm = get_task_mm(p);
 +              if (mm) {
 +                      setmax_mm_hiwater_rss(&maxrss, mm);
 +                      mmput(mm);
 +              }
 +      }
 +      r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
  }
  
  int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
@@@ -1525,11 -1511,11 +1525,11 @@@ SYSCALL_DEFINE5(prctl, int, option, uns
                case PR_SET_TSC:
                        error = SET_TSC_CTL(arg2);
                        break;
 -              case PR_TASK_PERF_COUNTERS_DISABLE:
 -                      error = perf_counter_task_disable();
 +              case PR_TASK_PERF_EVENTS_DISABLE:
 +                      error = perf_event_task_disable();
                        break;
 -              case PR_TASK_PERF_COUNTERS_ENABLE:
 -                      error = perf_counter_task_enable();
 +              case PR_TASK_PERF_EVENTS_ENABLE:
 +                      error = perf_event_task_enable();
                        break;
                case PR_GET_TIMERSLACK:
                        error = current->timer_slack_ns;
                                current->timer_slack_ns = arg2;
                        error = 0;
                        break;
+               case PR_MCE_KILL:
+                       if (arg4 | arg5)
+                               return -EINVAL;
+                       switch (arg2) {
+                       case 0:
+                               if (arg3 != 0)
+                                       return -EINVAL;
+                               current->flags &= ~PF_MCE_PROCESS;
+                               break;
+                       case 1:
+                               current->flags |= PF_MCE_PROCESS;
+                               if (arg3 != 0)
+                                       current->flags |= PF_MCE_EARLY;
+                               else
+                                       current->flags &= ~PF_MCE_EARLY;
+                               break;
+                       default:
+                               return -EINVAL;
+                       }
+                       error = 0;
+                       break;
                default:
                        error = -EINVAL;
                        break;
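
Besides the PR_MCE_KILL handling, k_getrusage() now fills in the peak RSS from
the signal_struct maxrss/cmaxrss counters plus the mm hiwater RSS, converted to
kilobytes by maxrss * (PAGE_SIZE / 1024).  A minimal user-space reader:

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
            struct rusage ru;

            if (getrusage(RUSAGE_SELF, &ru) == 0)
                    printf("peak RSS: %ld kB\n", ru.ru_maxrss);
            return 0;
    }
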
diff --combined kernel/sysctl.c
index a02697b7cb97a94a1f9632846a488c23e099fa7c,eacae77ac9fc0a3c8d0660cbdfb0e5337e61fe82..0d949c517412ee16822a5ca7d6e7c79218543741
@@@ -26,6 -26,7 +26,6 @@@
  #include <linux/proc_fs.h>
  #include <linux/security.h>
  #include <linux/ctype.h>
 -#include <linux/utsname.h>
  #include <linux/kmemcheck.h>
  #include <linux/smp_lock.h>
  #include <linux/fs.h>
@@@ -49,7 -50,7 +49,7 @@@
  #include <linux/reboot.h>
  #include <linux/ftrace.h>
  #include <linux/slow-work.h>
 -#include <linux/perf_counter.h>
 +#include <linux/perf_event.h>
  
  #include <asm/uaccess.h>
  #include <asm/processor.h>
@@@ -76,7 -77,6 +76,7 @@@ extern int max_threads
  extern int core_uses_pid;
  extern int suid_dumpable;
  extern char core_pattern[];
 +extern unsigned int core_pipe_limit;
  extern int pid_max;
  extern int min_free_kbytes;
  extern int pid_max_min, pid_max_max;
@@@ -91,9 -91,7 +91,9 @@@ extern int sysctl_nr_trim_pages
  #ifdef CONFIG_RCU_TORTURE_TEST
  extern int rcutorture_runnable;
  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 +#ifdef CONFIG_BLOCK
  extern int blk_iopoll_enabled;
 +#endif
  
  /* Constants used for minimum and  maximum */
  #ifdef CONFIG_DETECT_SOFTLOCKUP
@@@ -106,9 -104,6 +106,9 @@@ static int __maybe_unused one = 1
  static int __maybe_unused two = 2;
  static unsigned long one_ul = 1;
  static int one_hundred = 100;
 +#ifdef CONFIG_PRINTK
 +static int ten_thousand = 10000;
 +#endif
  
  /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
  static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@@ -163,9 -158,9 +163,9 @@@ extern int max_lock_depth
  #endif
  
  #ifdef CONFIG_PROC_SYSCTL
 -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 +static int proc_do_cad_pid(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos);
 -static int proc_taint(struct ctl_table *table, int write, struct file *filp,
 +static int proc_taint(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp, loff_t *ppos);
  #endif
  
@@@ -424,14 -419,6 +424,14 @@@ static struct ctl_table kern_table[] = 
                .proc_handler   = &proc_dostring,
                .strategy       = &sysctl_string,
        },
 +      {
 +              .ctl_name       = CTL_UNNUMBERED,
 +              .procname       = "core_pipe_limit",
 +              .data           = &core_pipe_limit,
 +              .maxlen         = sizeof(unsigned int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec,
 +      },
  #ifdef CONFIG_PROC_SYSCTL
        {
                .procname       = "tainted",
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
 +      {
 +              .ctl_name       = CTL_UNNUMBERED,
 +              .procname       = "printk_delay",
 +              .data           = &printk_delay_msec,
 +              .maxlen         = sizeof(int),
 +              .mode           = 0644,
 +              .proc_handler   = &proc_dointvec_minmax,
 +              .strategy       = &sysctl_intvec,
 +              .extra1         = &zero,
 +              .extra2         = &ten_thousand,
 +      },
  #endif
        {
                .ctl_name       = KERN_NGROUPS_MAX,
                .child          = slow_work_sysctls,
        },
  #endif
 -#ifdef CONFIG_PERF_COUNTERS
 +#ifdef CONFIG_PERF_EVENTS
        {
                .ctl_name       = CTL_UNNUMBERED,
 -              .procname       = "perf_counter_paranoid",
 -              .data           = &sysctl_perf_counter_paranoid,
 -              .maxlen         = sizeof(sysctl_perf_counter_paranoid),
 +              .procname       = "perf_event_paranoid",
 +              .data           = &sysctl_perf_event_paranoid,
 +              .maxlen         = sizeof(sysctl_perf_event_paranoid),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
 -              .procname       = "perf_counter_mlock_kb",
 -              .data           = &sysctl_perf_counter_mlock,
 -              .maxlen         = sizeof(sysctl_perf_counter_mlock),
 +              .procname       = "perf_event_mlock_kb",
 +              .data           = &sysctl_perf_event_mlock,
 +              .maxlen         = sizeof(sysctl_perf_event_mlock),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = CTL_UNNUMBERED,
 -              .procname       = "perf_counter_max_sample_rate",
 -              .data           = &sysctl_perf_counter_sample_rate,
 -              .maxlen         = sizeof(sysctl_perf_counter_sample_rate),
 +              .procname       = "perf_event_max_sample_rate",
 +              .data           = &sysctl_perf_event_sample_rate,
 +              .maxlen         = sizeof(sysctl_perf_event_sample_rate),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
                .proc_handler   = &proc_dointvec,
        },
  #endif
 +#ifdef CONFIG_BLOCK
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "blk_iopoll",
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
 +#endif
  /*
   * NOTE: do not add new entries to this table unless you have read
   * Documentation/sysctl/ctl_unnumbered.txt
@@@ -1398,6 -1372,31 +1398,31 @@@ static struct ctl_table vm_table[] = 
                .mode           = 0644,
                .proc_handler   = &scan_unevictable_handler,
        },
+ #ifdef CONFIG_MEMORY_FAILURE
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "memory_failure_early_kill",
+               .data           = &sysctl_memory_failure_early_kill,
+               .maxlen         = sizeof(sysctl_memory_failure_early_kill),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "memory_failure_recovery",
+               .data           = &sysctl_memory_failure_recovery,
+               .maxlen         = sizeof(sysctl_memory_failure_recovery),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec_minmax,
+               .strategy       = &sysctl_intvec,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
+ #endif
  /*
   * NOTE: do not add new entries to this table unless you have read
   * Documentation/sysctl/ctl_unnumbered.txt
@@@ -2226,7 -2225,7 +2251,7 @@@ void sysctl_head_put(struct ctl_table_h
  #ifdef CONFIG_PROC_SYSCTL
  
  static int _proc_do_string(void* data, int maxlen, int write,
 -                         struct file *filp, void __user *buffer,
 +                         void __user *buffer,
                           size_t *lenp, loff_t *ppos)
  {
        size_t len;
   * proc_dostring - read a string sysctl
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dostring(struct ctl_table *table, int write, struct file *filp,
 +int proc_dostring(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -      return _proc_do_string(table->data, table->maxlen, write, filp,
 +      return _proc_do_string(table->data, table->maxlen, write,
                               buffer, lenp, ppos);
  }
  
@@@ -2328,7 -2328,7 +2353,7 @@@ static int do_proc_dointvec_conv(int *n
  }
  
  static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 -                int write, struct file *filp, void __user *buffer,
 +                int write, void __user *buffer,
                  size_t *lenp, loff_t *ppos,
                  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
                              int write, void *data),
  #undef TMPBUFLEN
  }
  
 -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp,
 +static int do_proc_dointvec(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos,
                  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
                              int write, void *data),
                  void *data)
  {
 -      return __do_proc_dointvec(table->data, table, write, filp,
 +      return __do_proc_dointvec(table->data, table, write,
                        buffer, lenp, ppos, conv, data);
  }
  
   * proc_dointvec - read a vector of integers
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec(struct ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 +    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            NULL,NULL);
  }
  
   * Taint values can only be increased
   * This means we can safely use a temporary.
   */
 -static int proc_taint(struct ctl_table *table, int write, struct file *filp,
 +static int proc_taint(struct ctl_table *table, int write,
                               void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        struct ctl_table t;
  
        t = *table;
        t.data = &tmptaint;
 -      err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
 +      err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
        if (err < 0)
                return err;
  
@@@ -2533,6 -2534,7 +2558,6 @@@ static int do_proc_dointvec_minmax_conv
   * proc_dointvec_minmax - read a vector of integers with min/max values
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_minmax(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        struct do_proc_dointvec_minmax_conv_param param = {
                .min = (int *) table->extra1,
                .max = (int *) table->extra2,
        };
 -      return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
 +      return do_proc_dointvec(table, write, buffer, lenp, ppos,
                                do_proc_dointvec_minmax_conv, &param);
  }
  
  static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
 -                                   struct file *filp,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos,
                                     unsigned long convmul,
  }
  
  static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
 -                                   struct file *filp,
                                     void __user *buffer,
                                     size_t *lenp, loff_t *ppos,
                                     unsigned long convmul,
                                     unsigned long convdiv)
  {
        return __do_proc_doulongvec_minmax(table->data, table, write,
 -                      filp, buffer, lenp, ppos, convmul, convdiv);
 +                      buffer, lenp, ppos, convmul, convdiv);
  }
  
  /**
   * proc_doulongvec_minmax - read a vector of long integers with min/max values
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_doulongvec_minmax(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l);
 +    return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l);
  }
  
  /**
   * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   * Returns 0 on success.
   */
  int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 -                                    struct file *filp,
                                      void __user *buffer,
                                      size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_doulongvec_minmax(table, write, filp, buffer,
 +    return do_proc_doulongvec_minmax(table, write, buffer,
                                     lenp, ppos, HZ, 1000l);
  }
  
@@@ -2789,6 -2796,7 +2814,6 @@@ static int do_proc_dointvec_ms_jiffies_
   * proc_dointvec_jiffies - read a vector of integers as seconds
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_jiffies(struct ctl_table *table, int write,
                          void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 +    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            do_proc_dointvec_jiffies_conv,NULL);
  }
  
   * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: pointer to the file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -    return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
 +    return do_proc_dointvec(table,write,buffer,lenp,ppos,
                            do_proc_dointvec_userhz_jiffies_conv,NULL);
  }
  
   * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds
   * @table: the sysctl table
   * @write: %TRUE if this is a write to the sysctl file
 - * @filp: the file structure
   * @buffer: the user buffer
   * @lenp: the size of the user buffer
   * @ppos: file position
   *
   * Returns 0 on success.
   */
 -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp, loff_t *ppos)
  {
 -      return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
 +      return do_proc_dointvec(table, write, buffer, lenp, ppos,
                                do_proc_dointvec_ms_jiffies_conv, NULL);
  }
  
 -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
 +static int proc_do_cad_pid(struct ctl_table *table, int write,
                           void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        struct pid *new_pid;
  
        tmp = pid_vnr(cad_pid);
  
 -      r = __do_proc_dointvec(&tmp, table, write, filp, buffer,
 +      r = __do_proc_dointvec(&tmp, table, write, buffer,
                               lenp, ppos, NULL, NULL);
        if (r || !write)
                return r;
  
  #else /* CONFIG_PROC_FS */
  
 -int proc_dostring(struct ctl_table *table, int write, struct file *filp,
 +int proc_dostring(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec(struct ctl_table *table, int write,
                  void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_minmax(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_jiffies(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp,
 +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
                             void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
 -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp,
 +int proc_doulongvec_minmax(struct ctl_table *table, int write,
                    void __user *buffer, size_t *lenp, loff_t *ppos)
  {
        return -ENOSYS;
  }
  
  int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 -                                    struct file *filp,
                                      void __user *buffer,
                                      size_t *lenp, loff_t *ppos)
  {
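
The two new vm_table entries surface as /proc/sys/vm/memory_failure_early_kill
and /proc/sys/vm/memory_failure_recovery, each clamped to 0 or 1 by
proc_dointvec_minmax.  A small sketch that switches early kill on system-wide
(needs root):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/vm/memory_failure_early_kill", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fputs("1\n", f);        /* values outside 0..1 are rejected */
            fclose(f);
            return 0;
    }
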
diff --combined mm/Kconfig
index 71eb0b4cce8dbc425aa476d54f2048e5679c7dea,4b4e57a9643e240c90163f23fe3913d6e32891be..247760729593d37f841655dd54ec4572523255f7
@@@ -214,18 -214,6 +214,18 @@@ config HAVE_MLOCKED_PAGE_BI
  config MMU_NOTIFIER
        bool
  
 +config KSM
 +      bool "Enable KSM for page merging"
 +      depends on MMU
 +      help
 +        Enable Kernel Samepage Merging: KSM periodically scans those areas
 +        of an application's address space that an app has advised may be
 +        mergeable.  When it finds pages of identical content, it replaces
 +        the many instances by a single resident page with that content, so
 +        saving memory until one or another app needs to modify the content.
 +        Recommended for use with KVM, or with other duplicative applications.
 +        See Documentation/vm/ksm.txt for more information.
 +
  config DEFAULT_MMAP_MIN_ADDR
          int "Low address space to protect from user allocation"
          default 4096
          /proc/sys/vm/mmap_min_addr tunable.
  
  
+ config MEMORY_FAILURE
+       depends on MMU
+       depends on X86_MCE
+       bool "Enable recovery from hardware memory errors"
+       help
+         Enables code to recover from some memory failures on systems
+         with MCA recovery. This allows a system to continue running
+         even when some of its memory has uncorrected errors. This requires
+         special hardware support and typically ECC memory.
+
+ config HWPOISON_INJECT
+       tristate "Poison pages injector"
+       depends on MEMORY_FAILURE && DEBUG_KERNEL
+
  config NOMMU_INITIAL_TRIM_EXCESS
        int "Turn on mmap() excess space trimming before booting"
        depends on !MMU
diff --combined mm/Makefile
index 88193d73cd1a30dd623e94eb9b5bed0c96cf08e0,713c9f82d5ab86b306cc03408c3930b539a44b7f..515fd793c17fa989cffe0f3a686c8086e2f7ddca
@@@ -11,10 -11,10 +11,10 @@@ obj-y                      := bootmem.o filemap.o mempool.
                           maccess.o page_alloc.o page-writeback.o \
                           readahead.o swap.o truncate.o vmscan.o shmem.o \
                           prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 -                         page_isolation.o mm_init.o $(mmu-y)
 +                         page_isolation.o mm_init.o mmu_context.o \
 +                         pagewalk.o $(mmu-y)
  obj-y += init-mm.o
  
 -obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
  obj-$(CONFIG_BOUNCE)  += bounce.o
  obj-$(CONFIG_SWAP)    += page_io.o swap_state.o swapfile.o thrash.o
  obj-$(CONFIG_HAS_DMA) += dmapool.o
@@@ -25,7 -25,6 +25,7 @@@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += spar
  obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
  obj-$(CONFIG_SLOB) += slob.o
  obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 +obj-$(CONFIG_KSM) += ksm.o
  obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
  obj-$(CONFIG_SLAB) += slab.o
  obj-$(CONFIG_SLUB) += slub.o
@@@ -41,5 -40,7 +41,7 @@@ obj-$(CONFIG_SMP) += allocpercpu.
  endif
  obj-$(CONFIG_QUICKLIST) += quicklist.o
  obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
+ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
  obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --combined mm/filemap.c
index bcc7372aebbc4375d0763e4f3acd8d096bcb612d,75575c3921672e91a0690231659d1ecd1eda2103..c1fc205a92c6eae9840961ad34dbfc88b58a80a4
   *
   *  ->task->proc_lock
   *    ->dcache_lock           (proc_pid_lookup)
+  *
+  *  (code doesn't rely on that order, so you could switch it around)
+  *  ->tasklist_lock             (memory_failure, collect_procs_ao)
+  *    ->i_mmap_lock
   */
  
  /*
@@@ -119,8 -123,6 +123,8 @@@ void __remove_from_page_cache(struct pa
        page->mapping = NULL;
        mapping->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
 +      if (PageSwapBacked(page))
 +              __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
  
        /*
@@@ -433,8 -435,6 +437,8 @@@ int add_to_page_cache_locked(struct pag
                if (likely(!error)) {
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
 +                      if (PageSwapBacked(page))
 +                              __inc_zone_page_state(page, NR_SHMEM);
                        spin_unlock_irq(&mapping->tree_lock);
                } else {
                        page->mapping = NULL;
diff --combined mm/madvise.c
index d9ae2067952e5d2b5d09b260bc443d1ba0899d6a,8dbd38b8e4a4c9bf75997a9a4766e61884824f5b..35b1479b7c9d080ed97b772da44603c4f757093c
@@@ -11,7 -11,6 +11,7 @@@
  #include <linux/mempolicy.h>
  #include <linux/hugetlb.h>
  #include <linux/sched.h>
 +#include <linux/ksm.h>
  
  /*
   * Any behaviour which results in changes to the vma->vm_flags needs to
@@@ -42,7 -41,7 +42,7 @@@ static long madvise_behavior(struct vm_
        struct mm_struct * mm = vma->vm_mm;
        int error = 0;
        pgoff_t pgoff;
 -      int new_flags = vma->vm_flags;
 +      unsigned long new_flags = vma->vm_flags;
  
        switch (behavior) {
        case MADV_NORMAL:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
 +              if (vma->vm_flags & VM_IO) {
 +                      error = -EINVAL;
 +                      goto out;
 +              }
                new_flags &= ~VM_DONTCOPY;
                break;
 +      case MADV_MERGEABLE:
 +      case MADV_UNMERGEABLE:
 +              error = ksm_madvise(vma, start, end, behavior, &new_flags);
 +              if (error)
 +                      goto out;
 +              break;
        }
  
        if (new_flags == vma->vm_flags) {
@@@ -218,20 -207,67 +218,46 @@@ static long madvise_remove(struct vm_ar
        return error;
  }
  
+ #ifdef CONFIG_MEMORY_FAILURE
+ /*
+  * Error injection support for memory error handling.
+  */
+ static int madvise_hwpoison(unsigned long start, unsigned long end)
+ {
+       int ret = 0;
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+       for (; start < end; start += PAGE_SIZE) {
+               struct page *p;
+               int ret = get_user_pages(current, current->mm, start, 1,
+                                               0, 0, &p, NULL);
+               if (ret != 1)
+                       return ret;
+               printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
+                      page_to_pfn(p), start);
+               /* Ignore return value for now */
+               __memory_failure(page_to_pfn(p), 0, 1);
+               put_page(p);
+       }
+       return ret;
+ }
+ #endif
  static long
  madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
                unsigned long start, unsigned long end, int behavior)
  {
 -      long error;
 -
        switch (behavior) {
 -      case MADV_DOFORK:
 -              if (vma->vm_flags & VM_IO) {
 -                      error = -EINVAL;
 -                      break;
 -              }
 -      case MADV_DONTFORK:
 -      case MADV_NORMAL:
 -      case MADV_SEQUENTIAL:
 -      case MADV_RANDOM:
 -              error = madvise_behavior(vma, prev, start, end, behavior);
 -              break;
        case MADV_REMOVE:
 -              error = madvise_remove(vma, prev, start, end);
 -              break;
 -
 +              return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
 -              error = madvise_willneed(vma, prev, start, end);
 -              break;
 -
 +              return madvise_willneed(vma, prev, start, end);
        case MADV_DONTNEED:
 -              error = madvise_dontneed(vma, prev, start, end);
 -              break;
 -
 +              return madvise_dontneed(vma, prev, start, end);
        default:
 -              BUG();
 -              break;
 +              return madvise_behavior(vma, prev, start, end, behavior);
        }
 -      return error;
  }
  
  static int
@@@ -246,17 -282,12 +272,17 @@@ madvise_behavior_valid(int behavior
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
 +#ifdef CONFIG_KSM
 +      case MADV_MERGEABLE:
 +      case MADV_UNMERGEABLE:
 +#endif
                return 1;
  
        default:
                return 0;
        }
  }
 +
  /*
   * The madvise(2) system call.
   *
   *            so the kernel can free resources associated with it.
   *  MADV_REMOVE - the application wants to free up the given range of
   *            pages and associated backing store.
 + *  MADV_DONTFORK - omit this area from child's address space when forking:
 + *            typically, to avoid COWing pages pinned by get_user_pages().
 + *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 + *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 + *            this area with pages of identical content from other such areas.
 + *  MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
   *
   * return values:
   *  zero    - success
@@@ -308,6 -333,10 +334,10 @@@ SYSCALL_DEFINE3(madvise, unsigned long
        int write;
        size_t len;
  
+ #ifdef CONFIG_MEMORY_FAILURE
+       if (behavior == MADV_HWPOISON)
+               return madvise_hwpoison(start, start+len_in);
+ #endif
        if (!madvise_behavior_valid(behavior))
                return error;
  
diff --combined mm/memory.c
index b1443ac07c00a4f1a46de6bb260d00e8f52f99db,44ea41196c139ab80ecbb1a6066f6e8f759223cf..987389a809e77accb1915dbd291d3e38b8dc1e60
@@@ -45,7 -45,6 +45,7 @@@
  #include <linux/swap.h>
  #include <linux/highmem.h>
  #include <linux/pagemap.h>
 +#include <linux/ksm.h>
  #include <linux/rmap.h>
  #include <linux/module.h>
  #include <linux/delayacct.h>
@@@ -57,7 -56,6 +57,7 @@@
  #include <linux/swapops.h>
  #include <linux/elf.h>
  
 +#include <asm/io.h>
  #include <asm/pgalloc.h>
  #include <asm/uaccess.h>
  #include <asm/tlb.h>
@@@ -108,18 -106,6 +108,18 @@@ static int __init disable_randmaps(cha
  }
  __setup("norandmaps", disable_randmaps);
  
 +unsigned long zero_pfn __read_mostly;
 +unsigned long highest_memmap_pfn __read_mostly;
 +
 +/*
 + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 + */
 +static int __init init_zero_pfn(void)
 +{
 +      zero_pfn = page_to_pfn(ZERO_PAGE(0));
 +      return 0;
 +}
 +core_initcall(init_zero_pfn);
  
  /*
   * If a p?d_bad entry is found while walking page tables, report
@@@ -456,20 -442,6 +456,20 @@@ static inline int is_cow_mapping(unsign
        return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
  }
  
 +#ifndef is_zero_pfn
 +static inline int is_zero_pfn(unsigned long pfn)
 +{
 +      return pfn == zero_pfn;
 +}
 +#endif
 +
 +#ifndef my_zero_pfn
 +static inline unsigned long my_zero_pfn(unsigned long addr)
 +{
 +      return zero_pfn;
 +}
 +#endif
 +
  /*
   * vm_normal_page -- This function gets the "struct page" associated with a pte.
   *
@@@ -525,9 -497,7 +525,9 @@@ struct page *vm_normal_page(struct vm_a
        if (HAVE_PTE_SPECIAL) {
                if (likely(!pte_special(pte)))
                        goto check_pfn;
 -              if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
 +              if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
 +                      return NULL;
 +              if (!is_zero_pfn(pfn))
                        print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }
                }
        }
  
 +      if (is_zero_pfn(pfn))
 +              return NULL;
  check_pfn:
        if (unlikely(pfn > highest_memmap_pfn)) {
                print_bad_pte(vma, addr, pte, NULL);
@@@ -628,8 -596,8 +628,8 @@@ copy_one_pte(struct mm_struct *dst_mm, 
        page = vm_normal_page(vma, addr, pte);
        if (page) {
                get_page(page);
 -              page_dup_rmap(page, vma, addr);
 -              rss[!!PageAnon(page)]++;
 +              page_dup_rmap(page);
 +              rss[PageAnon(page)]++;
        }
  
  out_set_pte:
@@@ -1174,14 -1142,9 +1174,14 @@@ struct page *follow_page(struct vm_area
                goto no_page;
        if ((flags & FOLL_WRITE) && !pte_write(pte))
                goto unlock;
 +
        page = vm_normal_page(vma, address, pte);
 -      if (unlikely(!page))
 -              goto bad_page;
 +      if (unlikely(!page)) {
 +              if ((flags & FOLL_DUMP) ||
 +                  !is_zero_pfn(pte_pfn(pte)))
 +                      goto bad_page;
 +              page = pte_page(pte);
 +      }
  
        if (flags & FOLL_GET)
                get_page(page);
@@@ -1209,46 -1172,65 +1209,46 @@@ no_page
        pte_unmap_unlock(ptep, ptl);
        if (!pte_none(pte))
                return page;
 -      /* Fall through to ZERO_PAGE handling */
 +
  no_page_table:
        /*
         * When core dumping an enormous anonymous area that nobody
 -       * has touched so far, we don't want to allocate page tables.
 +       * has touched so far, we don't want to allocate unnecessary pages or
 +       * page tables.  Return error instead of NULL to skip handle_mm_fault,
 +       * then get_dump_page() will return NULL to leave a hole in the dump.
 +       * But we can only make this optimization where a hole would surely
 +       * be zero-filled if handle_mm_fault() actually did handle it.
         */
 -      if (flags & FOLL_ANON) {
 -              page = ZERO_PAGE(0);
 -              if (flags & FOLL_GET)
 -                      get_page(page);
 -              BUG_ON(flags & FOLL_WRITE);
 -      }
 +      if ((flags & FOLL_DUMP) &&
 +          (!vma->vm_ops || !vma->vm_ops->fault))
 +              return ERR_PTR(-EFAULT);
        return page;
  }
  
 -/* Can we do the FOLL_ANON optimization? */
 -static inline int use_zero_page(struct vm_area_struct *vma)
 -{
 -      /*
 -       * We don't want to optimize FOLL_ANON for make_pages_present()
 -       * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
 -       * we want to get the page from the page tables to make sure
 -       * that we serialize and update with any other user of that
 -       * mapping.
 -       */
 -      if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
 -              return 0;
 -      /*
 -       * And if we have a fault routine, it's not an anonymous region.
 -       */
 -      return !vma->vm_ops || !vma->vm_ops->fault;
 -}
 -
 -
 -
  int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 -                   unsigned long start, int nr_pages, int flags,
 +                   unsigned long start, int nr_pages, unsigned int gup_flags,
                     struct page **pages, struct vm_area_struct **vmas)
  {
        int i;
 -      unsigned int vm_flags = 0;
 -      int write = !!(flags & GUP_FLAGS_WRITE);
 -      int force = !!(flags & GUP_FLAGS_FORCE);
 -      int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
 -      int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
 +      unsigned long vm_flags;
  
        if (nr_pages <= 0)
                return 0;
 +
 +      VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
 +
        /* 
         * Require read or write permissions.
 -       * If 'force' is set, we only require the "MAY" flags.
 +       * If FOLL_FORCE is set, we only require the "MAY" flags.
         */
 -      vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 -      vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 +      vm_flags  = (gup_flags & FOLL_WRITE) ?
 +                      (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
 +      vm_flags &= (gup_flags & FOLL_FORCE) ?
 +                      (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
        i = 0;
  
        do {
                struct vm_area_struct *vma;
 -              unsigned int foll_flags;
  
                vma = find_extend_vma(mm, start);
                if (!vma && in_gate_area(tsk, start)) {
                        pte_t *pte;
  
                        /* user gate pages are read-only */
 -                      if (!ignore && write)
 +                      if (gup_flags & FOLL_WRITE)
                                return i ? : -EFAULT;
                        if (pg > TASK_SIZE)
                                pgd = pgd_offset_k(pg);
  
                if (!vma ||
                    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
 -                  (!ignore && !(vm_flags & vma->vm_flags)))
 +                  !(vm_flags & vma->vm_flags))
                        return i ? : -EFAULT;
  
                if (is_vm_hugetlb_page(vma)) {
                        i = follow_hugetlb_page(mm, vma, pages, vmas,
 -                                              &start, &nr_pages, i, write);
 +                                      &start, &nr_pages, i, gup_flags);
                        continue;
                }
  
 -              foll_flags = FOLL_TOUCH;
 -              if (pages)
 -                      foll_flags |= FOLL_GET;
 -              if (!write && use_zero_page(vma))
 -                      foll_flags |= FOLL_ANON;
 -
                do {
                        struct page *page;
 +                      unsigned int foll_flags = gup_flags;
  
                        /*
                         * If we have a pending SIGKILL, don't keep faulting
 -                       * pages and potentially allocating memory, unless
 -                       * current is handling munlock--e.g., on exit. In
 -                       * that case, we are not allocating memory.  Rather,
 -                       * we're only unlocking already resident/mapped pages.
 +                       * pages and potentially allocating memory.
                         */
 -                      if (unlikely(!ignore_sigkill &&
 -                                      fatal_signal_pending(current)))
 +                      if (unlikely(fatal_signal_pending(current)))
                                return i ? i : -ERESTARTSYS;
  
 -                      if (write)
 -                              foll_flags |= FOLL_WRITE;
 -
                        cond_resched();
                        while (!(page = follow_page(vma, start, foll_flags))) {
                                int ret;
                                if (ret & VM_FAULT_ERROR) {
                                        if (ret & VM_FAULT_OOM)
                                                return i ? i : -ENOMEM;
-                                       else if (ret & VM_FAULT_SIGBUS)
+                                       if (ret &
+                                           (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
                                                return i ? i : -EFAULT;
                                        BUG();
                                }
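
Illustrative aside (not part of the patch): a minimal sketch of how a caller might pin a single user page with get_user_pages() and see the -EFAULT that, after the hunk above, also covers VM_FAULT_HWPOISON. The helper name and error handling are invented for illustration.

/*
 * Hypothetical caller, for illustration only: pin one user page and
 * report failure.  After the change above, a poisoned page surfaces to
 * callers as -EFAULT, the same way VM_FAULT_SIGBUS always did.
 */
#include <linux/mm.h>
#include <linux/sched.h>

static int example_pin_one_page(unsigned long addr, struct page **pagep)
{
        int ret;

        down_read(&current->mm->mmap_sem);
        ret = get_user_pages(current, current->mm, addr,
                             1,         /* nr_pages */
                             1,         /* write    */
                             0,         /* force    */
                             pagep, NULL);
        up_read(&current->mm->mmap_sem);

        if (ret == 1)
                return 0;       /* page pinned, release with put_page() */
        return ret < 0 ? ret : -EFAULT;
}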
@@@ -1424,47 -1419,18 +1425,47 @@@ int get_user_pages(struct task_struct *
                unsigned long start, int nr_pages, int write, int force,
                struct page **pages, struct vm_area_struct **vmas)
  {
 -      int flags = 0;
 +      int flags = FOLL_TOUCH;
  
 +      if (pages)
 +              flags |= FOLL_GET;
        if (write)
 -              flags |= GUP_FLAGS_WRITE;
 +              flags |= FOLL_WRITE;
        if (force)
 -              flags |= GUP_FLAGS_FORCE;
 +              flags |= FOLL_FORCE;
  
        return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
  }
 -
  EXPORT_SYMBOL(get_user_pages);
  
 +/**
 + * get_dump_page() - pin user page in memory while writing it to core dump
 + * @addr: user address
 + *
 + * Returns struct page pointer of user page pinned for dump,
 + * to be freed afterwards by page_cache_release() or put_page().
 + *
 + * Returns NULL on any kind of failure - a hole must then be inserted into
 + * the corefile, to preserve alignment with its headers; and also returns
 + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 + * allowing a hole to be left in the corefile to save diskspace.
 + *
 + * Called without mmap_sem, but after all other threads have been killed.
 + */
 +#ifdef CONFIG_ELF_CORE
 +struct page *get_dump_page(unsigned long addr)
 +{
 +      struct vm_area_struct *vma;
 +      struct page *page;
 +
 +      if (__get_user_pages(current, current->mm, addr, 1,
 +                      FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
 +              return NULL;
 +      flush_cache_page(vma, addr, page_to_pfn(page));
 +      return page;
 +}
 +#endif /* CONFIG_ELF_CORE */
 +
  pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
  {
@@@ -1642,8 -1608,7 +1643,8 @@@ int vm_insert_mixed(struct vm_area_stru
         * If we don't have pte special, then we have to use the pfn_valid()
         * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
         * refcount the page if pfn_valid is true (hence insert_page rather
 -       * than insert_pfn).
 +       * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
 +       * without pte special, it would then be refcounted as a normal page.
         */
        if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
                struct page *page;
@@@ -2009,7 -1974,7 +2010,7 @@@ static int do_wp_page(struct mm_struct 
         * Take out anonymous pages first, anonymous shared vmas are
         * not dirty accountable.
         */
 -      if (PageAnon(old_page)) {
 +      if (PageAnon(old_page) && !PageKsm(old_page)) {
                if (!trylock_page(old_page)) {
                        page_cache_get(old_page);
                        pte_unmap_unlock(page_table, ptl);
@@@ -2110,19 -2075,10 +2111,19 @@@ gotten
  
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
 -      VM_BUG_ON(old_page == ZERO_PAGE(0));
 -      new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 -      if (!new_page)
 -              goto oom;
 +
 +      if (is_zero_pfn(pte_pfn(orig_pte))) {
 +              new_page = alloc_zeroed_user_highpage_movable(vma, address);
 +              if (!new_page)
 +                      goto oom;
 +      } else {
 +              new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 +              if (!new_page)
 +                      goto oom;
 +              cow_user_page(new_page, old_page, address, vma);
 +      }
 +      __SetPageUptodate(new_page);
 +
        /*
         * Don't let another task, with possibly unlocked vma,
         * keep the mlocked page.
                clear_page_mlock(old_page);
                unlock_page(old_page);
        }
 -      cow_user_page(new_page, old_page, address, vma);
 -      __SetPageUptodate(new_page);
  
        if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                goto oom_free_new;
                 * seen in the presence of one thread doing SMC and another
                 * thread doing COW.
                 */
 -              ptep_clear_flush_notify(vma, address, page_table);
 +              ptep_clear_flush(vma, address, page_table);
                page_add_new_anon_rmap(new_page, vma, address);
 -              set_pte_at(mm, address, page_table, entry);
 +              /*
 +               * We call the notify macro here because, when using secondary
 +               * mmu page tables (such as kvm shadow page tables), we want the
 +               * new page to be mapped directly into the secondary page table.
 +               */
 +              set_pte_at_notify(mm, address, page_table, entry);
                update_mmu_cache(vma, address, entry);
                if (old_page) {
                        /*
@@@ -2559,8 -2512,15 +2560,15 @@@ static int do_swap_page(struct mm_struc
                goto out;
  
        entry = pte_to_swp_entry(orig_pte);
-       if (is_migration_entry(entry)) {
-               migration_entry_wait(mm, pmd, address);
+       if (unlikely(non_swap_entry(entry))) {
+               if (is_migration_entry(entry)) {
+                       migration_entry_wait(mm, pmd, address);
+               } else if (is_hwpoison_entry(entry)) {
+                       ret = VM_FAULT_HWPOISON;
+               } else {
+                       print_bad_pte(vma, address, orig_pte, NULL);
+                       ret = VM_FAULT_OOM;
+               }
                goto out;
        }
        delayacct_set_flag(DELAYACCT_PF_SWAPIN);
                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
+       } else if (PageHWPoison(page)) {
+               ret = VM_FAULT_HWPOISON;
+               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               goto out;
        }
  
        lock_page(page);
@@@ -2672,16 -2636,6 +2684,16 @@@ static int do_anonymous_page(struct mm_
        spinlock_t *ptl;
        pte_t entry;
  
 +      if (!(flags & FAULT_FLAG_WRITE)) {
 +              entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 +                                              vma->vm_page_prot));
 +              ptl = pte_lockptr(mm, pmd);
 +              spin_lock(ptl);
 +              if (!pte_none(*page_table))
 +                      goto unlock;
 +              goto setpte;
 +      }
 +
        /* Allocate our own private page. */
        pte_unmap(page_table);
  
                goto oom_free_page;
  
        entry = mk_pte(page, vma->vm_page_prot);
 -      entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 +      if (vma->vm_flags & VM_WRITE)
 +              entry = pte_mkwrite(pte_mkdirty(entry));
  
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte_none(*page_table))
                goto release;
 +
        inc_mm_counter(mm, anon_rss);
        page_add_new_anon_rmap(page, vma, address);
 +setpte:
        set_pte_at(mm, address, page_table, entry);
  
        /* No need to invalidate - it was non-present before */
@@@ -2760,6 -2711,12 +2772,12 @@@ static int __do_fault(struct mm_struct 
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
  
+       if (unlikely(PageHWPoison(vmf.page))) {
+               if (ret & VM_FAULT_LOCKED)
+                       unlock_page(vmf.page);
+               return VM_FAULT_HWPOISON;
+       }
        /*
         * For consistency in subsequent calls, make the faulted page always
         * locked.
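
Illustrative aside: with do_swap_page() and __do_fault() now able to return VM_FAULT_HWPOISON, an architecture fault handler can map that bit onto a SIGBUS. The sketch below is an assumption-laden outline, not the x86 code from this series; BUS_MCEERR_AR is assumed to be the "action required" si_code this series introduces, and real handlers also fill in si_addr_lsb.

/*
 * Outline only -- not the real arch code.  Shows one plausible way a
 * page-fault handler could translate VM_FAULT_HWPOISON into a SIGBUS.
 */
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/signal.h>

static void example_report_fault_error(struct task_struct *tsk,
                                       unsigned long address,
                                       unsigned int fault)
{
        siginfo_t info;

        if (fault & VM_FAULT_OOM)
                return;                 /* defer to the OOM machinery */

        info.si_signo = SIGBUS;
        info.si_errno = 0;
        info.si_addr  = (void __user *)address;
        /* BUS_MCEERR_AR marks an "action required" memory error */
        info.si_code  = (fault & VM_FAULT_HWPOISON) ? BUS_MCEERR_AR : BUS_ADRERR;
        /* real handlers also set si_addr_lsb to the poison granularity */

        force_sig_info(SIGBUS, &info, tsk);
}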
diff --combined mm/migrate.c
index 16052e80aaacbc182c9ea421bb7f84b02fc3a5b4,e3a0cd3859a9a71b682178c08ec346e6f22aec73..1a4bf4813780eb700ee026030bca18fedc2fbae6
@@@ -67,8 -67,6 +67,8 @@@ int putback_lru_pages(struct list_head 
  
        list_for_each_entry_safe(page, page2, l, lru) {
                list_del(&page->lru);
 +              dec_zone_page_state(page, NR_ISOLATED_ANON +
 +                              page_is_file_cache(page));
                putback_lru_page(page);
                count++;
        }
@@@ -149,7 -147,7 +149,7 @@@ out
  static void remove_file_migration_ptes(struct page *old, struct page *new)
  {
        struct vm_area_struct *vma;
 -      struct address_space *mapping = page_mapping(new);
 +      struct address_space *mapping = new->mapping;
        struct prio_tree_iter iter;
        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
  
@@@ -272,7 -270,7 +272,7 @@@ static int migrate_page_move_mapping(st
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
                                        page_index(page));
  
 -      expected_count = 2 + !!page_has_private(page);
 +      expected_count = 2 + page_has_private(page);
        if (page_count(page) != expected_count ||
                        (struct page *)radix_tree_deref_slot(pslot) != page) {
                spin_unlock_irq(&mapping->tree_lock);
         */
        __dec_zone_page_state(page, NR_FILE_PAGES);
        __inc_zone_page_state(newpage, NR_FILE_PAGES);
 -
 +      if (PageSwapBacked(page)) {
 +              __dec_zone_page_state(page, NR_SHMEM);
 +              __inc_zone_page_state(newpage, NR_SHMEM);
 +      }
        spin_unlock_irq(&mapping->tree_lock);
  
        return 0;
@@@ -669,15 -664,13 +669,15 @@@ static int unmap_and_move(new_page_t ge
                         *    needs to be effective.
                         */
                        try_to_free_buffers(page);
 +                      goto rcu_unlock;
                }
 -              goto rcu_unlock;
 +              goto skip_unmap;
        }
  
        /* Establish migration ptes or remove ptes */
-       try_to_unmap(page, 1);
+       try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
  
 +skip_unmap:
        if (!page_mapped(page))
                rc = move_to_new_page(newpage, page);
  
@@@ -700,8 -693,6 +700,8 @@@ unlock
                 * restored.
                 */
                list_del(&page->lru);
 +              dec_zone_page_state(page, NR_ISOLATED_ANON +
 +                              page_is_file_cache(page));
                putback_lru_page(page);
        }
  
@@@ -746,13 -737,6 +746,13 @@@ int migrate_pages(struct list_head *fro
        struct page *page2;
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc;
 +      unsigned long flags;
 +
 +      local_irq_save(flags);
 +      list_for_each_entry(page, from, lru)
 +              __inc_zone_page_state(page, NR_ISOLATED_ANON +
 +                              page_is_file_cache(page));
 +      local_irq_restore(flags);
  
        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;
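
Illustrative aside: migrate_pages() now charges pages on the list to the NR_ISOLATED_* counters and drops them again in putback_lru_pages(). A deliberately naive caller might look like the sketch below; it would sit inside mm/ next to the code above, the allocation callback ignores node placement, and every name is made up.

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/migrate.h>
#include <linux/mm.h>
#include <linux/swap.h>

/* naive new_page_t callback: no node awareness, illustration only */
static struct page *example_new_page(struct page *page, unsigned long private,
                                     int **result)
{
        return alloc_page(GFP_HIGHUSER_MOVABLE);
}

static int example_migrate_one(struct page *page)
{
        LIST_HEAD(pagelist);

        if (isolate_lru_page(page))     /* 0 on success */
                return -EBUSY;

        list_add(&page->lru, &pagelist);
        return migrate_pages(&pagelist, example_new_page, 0);
}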
diff --combined mm/page-writeback.c
index be197f71b096cb83d6ac48ab68cef84b2ba3fc61,bba82c414ba81eb5aaa521a6e31fd0fdbe787d11..d99664e8607e761235a13b2662353df131ac5b41
@@@ -155,37 -155,37 +155,37 @@@ static void update_completion_period(vo
  }
  
  int dirty_background_ratio_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        int ret;
  
 -      ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write)
                dirty_background_bytes = 0;
        return ret;
  }
  
  int dirty_background_bytes_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        int ret;
  
 -      ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write)
                dirty_background_ratio = 0;
        return ret;
  }
  
  int dirty_ratio_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        int old_ratio = vm_dirty_ratio;
        int ret;
  
 -      ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
                update_completion_period();
                vm_dirty_bytes = 0;
  
  
  int dirty_bytes_handler(struct ctl_table *table, int write,
 -              struct file *filp, void __user *buffer, size_t *lenp,
 +              void __user *buffer, size_t *lenp,
                loff_t *ppos)
  {
        unsigned long old_bytes = vm_dirty_bytes;
        int ret;
  
 -      ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
 +      ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
                update_completion_period();
                vm_dirty_ratio = 0;
@@@ -315,7 -315,7 +315,7 @@@ int bdi_set_min_ratio(struct backing_de
  {
        int ret = 0;
  
 -      spin_lock(&bdi_lock);
 +      spin_lock_bh(&bdi_lock);
        if (min_ratio > bdi->max_ratio) {
                ret = -EINVAL;
        } else {
                        ret = -EINVAL;
                }
        }
 -      spin_unlock(&bdi_lock);
 +      spin_unlock_bh(&bdi_lock);
  
        return ret;
  }
@@@ -339,14 -339,14 +339,14 @@@ int bdi_set_max_ratio(struct backing_de
        if (max_ratio > 100)
                return -EINVAL;
  
 -      spin_lock(&bdi_lock);
 +      spin_lock_bh(&bdi_lock);
        if (bdi->min_ratio > max_ratio) {
                ret = -EINVAL;
        } else {
                bdi->max_ratio = max_ratio;
                bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
        }
 -      spin_unlock(&bdi_lock);
 +      spin_unlock_bh(&bdi_lock);
  
        return ret;
  }
@@@ -380,8 -380,7 +380,8 @@@ static unsigned long highmem_dirtyable_
                struct zone *z =
                        &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
  
 -              x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
 +              x += zone_page_state(z, NR_FREE_PAGES) +
 +                   zone_reclaimable_pages(z);
        }
        /*
         * Make sure that the number of highmem pages is never larger
@@@ -405,7 -404,7 +405,7 @@@ unsigned long determine_dirtyable_memor
  {
        unsigned long x;
  
 -      x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
 +      x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
  
        if (!vm_highmem_is_dirtyable)
                x -= highmem_dirtyable_memory(x);
@@@ -486,7 -485,6 +486,7 @@@ static void balance_dirty_pages(struct 
        unsigned long bdi_thresh;
        unsigned long pages_written = 0;
        unsigned long write_chunk = sync_writeback_pages();
 +      unsigned long pause = 1;
  
        struct backing_dev_info *bdi = mapping->backing_dev_info;
  
                if (pages_written >= write_chunk)
                        break;          /* We've done our duty */
  
 -              schedule_timeout(1);
 +              schedule_timeout_interruptible(pause);
 +
 +              /*
 +               * Increase the delay for each loop, up to our previous
 +               * default of taking a 100ms nap.
 +               */
 +              pause <<= 1;
 +              if (pause > HZ / 10)
 +                      pause = HZ / 10;
        }
  
        if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
        if ((laptop_mode && pages_written) ||
            (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
                                          + global_page_state(NR_UNSTABLE_NFS))
 -                                        > background_thresh))) {
 -              struct writeback_control wbc = {
 -                      .bdi            = bdi,
 -                      .sync_mode      = WB_SYNC_NONE,
 -                      .nr_to_write    = nr_writeback,
 -              };
 -
 -
 -              bdi_start_writeback(&wbc);
 -      }
 +                                        > background_thresh)))
 +              bdi_start_writeback(bdi, nr_writeback);
  }
  
  void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@@ -686,9 -684,9 +686,9 @@@ static DEFINE_TIMER(laptop_mode_wb_time
   * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
   */
  int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
 -      proc_dointvec(table, write, file, buffer, length, ppos);
 +      proc_dointvec(table, write, buffer, length, ppos);
        return 0;
  }
  
@@@ -1022,10 -1020,12 +1022,10 @@@ int do_writepages(struct address_space 
  
        if (wbc->nr_to_write <= 0)
                return 0;
 -      wbc->for_writepages = 1;
        if (mapping->a_ops->writepages)
                ret = mapping->a_ops->writepages(mapping, wbc);
        else
                ret = generic_writepages(mapping, wbc);
 -      wbc->for_writepages = 0;
        return ret;
  }
  
@@@ -1149,6 -1149,13 +1149,13 @@@ int redirty_page_for_writepage(struct w
  EXPORT_SYMBOL(redirty_page_for_writepage);
  
  /*
+  * Dirty a page.
+  *
+  * For pages with a mapping this should be done under the page lock
+  * for the benefit of asynchronous memory error handling, which prefers
+  * a consistent dirty state. This rule can be broken in some special
+  * cases, but it is better not to.
+  *
   * If the mapping doesn't provide a set_page_dirty a_op, then
   * just fall through and assume that it wants buffer_heads.
   */
diff --combined mm/page_alloc.c
index 88248b3c20bb30dc7216221c05f9f8b47b737df2,9faa7ad95ac536ad34f16bf16996261554705fc3..bf720550b44d85adc294f7fd0b8ede38f73a8902
@@@ -48,7 -48,6 +48,7 @@@
  #include <linux/page_cgroup.h>
  #include <linux/debugobjects.h>
  #include <linux/kmemleak.h>
 +#include <trace/events/kmem.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@@ -72,6 -71,7 +72,6 @@@ EXPORT_SYMBOL(node_states)
  
  unsigned long totalram_pages __read_mostly;
  unsigned long totalreserve_pages __read_mostly;
 -unsigned long highest_memmap_pfn __read_mostly;
  int percpu_pagelist_fraction;
  gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  
@@@ -123,8 -123,8 +123,8 @@@ static char * const zone_names[MAX_NR_Z
  
  int min_free_kbytes = 1024;
  
 -unsigned long __meminitdata nr_kernel_pages;
 -unsigned long __meminitdata nr_all_pages;
 +static unsigned long __meminitdata nr_kernel_pages;
 +static unsigned long __meminitdata nr_all_pages;
  static unsigned long __meminitdata dma_reserve;
  
  #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
@@@ -234,6 -234,12 +234,12 @@@ static void bad_page(struct page *page
        static unsigned long nr_shown;
        static unsigned long nr_unshown;
  
+       /* Don't complain about poisoned pages */
+       if (PageHWPoison(page)) {
+               __ClearPageBuddy(page);
+               return;
+       }
        /*
         * Allow a burst of 60 reports, then keep quiet for that minute;
         * or allow a steady drip of one report per second.
@@@ -510,7 -516,7 +516,7 @@@ static inline int free_pages_check(stru
  }
  
  /*
 - * Frees a list of pages. 
 + * Frees a number of pages from the PCP lists
   * Assumes all pages on list are in same zone, and of same order.
   * count is the number of pages to free.
   *
   * And clear the zone's pages_scanned counter, to hold off the "all pages are
   * pinned" detection logic.
   */
 -static void free_pages_bulk(struct zone *zone, int count,
 -                                      struct list_head *list, int order)
 +static void free_pcppages_bulk(struct zone *zone, int count,
 +                                      struct per_cpu_pages *pcp)
  {
 +      int migratetype = 0;
 +      int batch_free = 0;
 +
        spin_lock(&zone->lock);
        zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
        zone->pages_scanned = 0;
  
 -      __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
 -      while (count--) {
 +      __mod_zone_page_state(zone, NR_FREE_PAGES, count);
 +      while (count) {
                struct page *page;
 +              struct list_head *list;
  
 -              VM_BUG_ON(list_empty(list));
 -              page = list_entry(list->prev, struct page, lru);
 -              /* have to delete it as __free_one_page list manipulates */
 -              list_del(&page->lru);
 -              __free_one_page(page, zone, order, page_private(page));
 +              /*
 +               * Remove pages from lists in a round-robin fashion. A
 +               * batch_free count is maintained that is incremented when an
 +               * empty list is encountered.  This is so more pages are freed
 +               * off fuller lists instead of spinning excessively around empty
 +               * lists
 +               */
 +              do {
 +                      batch_free++;
 +                      if (++migratetype == MIGRATE_PCPTYPES)
 +                              migratetype = 0;
 +                      list = &pcp->lists[migratetype];
 +              } while (list_empty(list));
 +
 +              do {
 +                      page = list_entry(list->prev, struct page, lru);
 +                      /* must delete as __free_one_page list manipulates */
 +                      list_del(&page->lru);
 +                      __free_one_page(page, zone, 0, migratetype);
 +                      trace_mm_page_pcpu_drain(page, 0, migratetype);
 +              } while (--count && --batch_free && !list_empty(list));
        }
        spin_unlock(&zone->lock);
  }
@@@ -577,7 -563,7 +583,7 @@@ static void __free_pages_ok(struct pag
        unsigned long flags;
        int i;
        int bad = 0;
 -      int wasMlocked = TestClearPageMlocked(page);
 +      int wasMlocked = __TestClearPageMlocked(page);
  
        kmemcheck_free_shadow(page, order);
  
@@@ -666,7 -652,7 +672,7 @@@ static inline void expand(struct zone *
  /*
   * This page is about to be returned from the page allocator
   */
- static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+ static inline int check_new_page(struct page *page)
  {
        if (unlikely(page_mapcount(page) |
                (page->mapping != NULL)  |
                bad_page(page);
                return 1;
        }
+       return 0;
+ }
+ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+ {
+       int i;
+       for (i = 0; i < (1 << order); i++) {
+               struct page *p = page + i;
+               if (unlikely(check_new_page(p)))
+                       return 1;
+       }
  
        set_page_private(page, 0);
        set_page_refcounted(page);
@@@ -803,17 -801,6 +821,17 @@@ static int move_freepages_block(struct 
        return move_freepages(zone, start_page, end_page, migratetype);
  }
  
 +static void change_pageblock_range(struct page *pageblock_page,
 +                                      int start_order, int migratetype)
 +{
 +      int nr_pageblocks = 1 << (start_order - pageblock_order);
 +
 +      while (nr_pageblocks--) {
 +              set_pageblock_migratetype(pageblock_page, migratetype);
 +              pageblock_page += pageblock_nr_pages;
 +      }
 +}
 +
  /* Remove an element from the buddy allocator from the fallback list */
  static inline struct page *
  __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
                        list_del(&page->lru);
                        rmv_page_order(page);
  
 -                      if (current_order == pageblock_order)
 -                              set_pageblock_migratetype(page,
 +                      /* Take ownership for orders >= pageblock_order */
 +                      if (current_order >= pageblock_order)
 +                              change_pageblock_range(page, current_order,
                                                        start_migratetype);
  
                        expand(zone, page, order, current_order, area, migratetype);
 +
 +                      trace_mm_page_alloc_extfrag(page, order, current_order,
 +                              start_migratetype, migratetype);
 +
                        return page;
                }
        }
@@@ -910,7 -892,6 +928,7 @@@ retry_reserve
                }
        }
  
 +      trace_mm_page_alloc_zone_locked(page, order, migratetype);
        return page;
  }
  
@@@ -971,7 -952,7 +989,7 @@@ void drain_zone_pages(struct zone *zone
                to_drain = pcp->batch;
        else
                to_drain = pcp->count;
 -      free_pages_bulk(zone, to_drain, &pcp->list, 0);
 +      free_pcppages_bulk(zone, to_drain, pcp);
        pcp->count -= to_drain;
        local_irq_restore(flags);
  }
@@@ -997,7 -978,7 +1015,7 @@@ static void drain_pages(unsigned int cp
  
                pcp = &pset->pcp;
                local_irq_save(flags);
 -              free_pages_bulk(zone, pcp->count, &pcp->list, 0);
 +              free_pcppages_bulk(zone, pcp->count, pcp);
                pcp->count = 0;
                local_irq_restore(flags);
        }
@@@ -1063,8 -1044,7 +1081,8 @@@ static void free_hot_cold_page(struct p
        struct zone *zone = page_zone(page);
        struct per_cpu_pages *pcp;
        unsigned long flags;
 -      int wasMlocked = TestClearPageMlocked(page);
 +      int migratetype;
 +      int wasMlocked = __TestClearPageMlocked(page);
  
        kmemcheck_free_shadow(page, 0);
  
        kernel_map_pages(page, 1, 0);
  
        pcp = &zone_pcp(zone, get_cpu())->pcp;
 -      set_page_private(page, get_pageblock_migratetype(page));
 +      migratetype = get_pageblock_migratetype(page);
 +      set_page_private(page, migratetype);
        local_irq_save(flags);
        if (unlikely(wasMlocked))
                free_page_mlock(page);
        __count_vm_event(PGFREE);
  
 +      /*
 +       * We only track unmovable, reclaimable and movable on pcp lists.
 +       * Free ISOLATE pages back to the allocator because they are being
 +       * offlined, but treat RESERVE as movable pages so we can get those
 +       * areas back if necessary. Otherwise, we may have to free
 +       * excessively into the page allocator.
 +       */
 +      if (migratetype >= MIGRATE_PCPTYPES) {
 +              if (unlikely(migratetype == MIGRATE_ISOLATE)) {
 +                      free_one_page(zone, page, 0, migratetype);
 +                      goto out;
 +              }
 +              migratetype = MIGRATE_MOVABLE;
 +      }
 +
        if (cold)
 -              list_add_tail(&page->lru, &pcp->list);
 +              list_add_tail(&page->lru, &pcp->lists[migratetype]);
        else
 -              list_add(&page->lru, &pcp->list);
 +              list_add(&page->lru, &pcp->lists[migratetype]);
        pcp->count++;
        if (pcp->count >= pcp->high) {
 -              free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
 +              free_pcppages_bulk(zone, pcp->batch, pcp);
                pcp->count -= pcp->batch;
        }
 +
 +out:
        local_irq_restore(flags);
        put_cpu();
  }
  
  void free_hot_page(struct page *page)
  {
 +      trace_mm_page_free_direct(page, 0);
        free_hot_cold_page(page, 0);
  }
        
 -void free_cold_page(struct page *page)
 -{
 -      free_hot_cold_page(page, 1);
 -}
 -
  /*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
@@@ -1171,23 -1137,35 +1189,23 @@@ again
        cpu  = get_cpu();
        if (likely(order == 0)) {
                struct per_cpu_pages *pcp;
 +              struct list_head *list;
  
                pcp = &zone_pcp(zone, cpu)->pcp;
 +              list = &pcp->lists[migratetype];
                local_irq_save(flags);
 -              if (!pcp->count) {
 -                      pcp->count = rmqueue_bulk(zone, 0,
 -                                      pcp->batch, &pcp->list,
 +              if (list_empty(list)) {
 +                      pcp->count += rmqueue_bulk(zone, 0,
 +                                      pcp->batch, list,
                                        migratetype, cold);
 -                      if (unlikely(!pcp->count))
 +                      if (unlikely(list_empty(list)))
                                goto failed;
                }
  
 -              /* Find a page of the appropriate migrate type */
 -              if (cold) {
 -                      list_for_each_entry_reverse(page, &pcp->list, lru)
 -                              if (page_private(page) == migratetype)
 -                                      break;
 -              } else {
 -                      list_for_each_entry(page, &pcp->list, lru)
 -                              if (page_private(page) == migratetype)
 -                                      break;
 -              }
 -
 -              /* Allocate more to the pcp list if necessary */
 -              if (unlikely(&page->lru == &pcp->list)) {
 -                      pcp->count += rmqueue_bulk(zone, 0,
 -                                      pcp->batch, &pcp->list,
 -                                      migratetype, cold);
 -                      page = list_entry(pcp->list.next, struct page, lru);
 -              }
 +              if (cold)
 +                      page = list_entry(list->prev, struct page, lru);
 +              else
 +                      page = list_entry(list->next, struct page, lru);
  
                list_del(&page->lru);
                pcp->count--;
@@@ -1667,6 -1645,10 +1685,6 @@@ __alloc_pages_direct_reclaim(gfp_t gfp_
  
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
 -
 -      /*
 -       * The task's cpuset might have expanded its set of allowable nodes
 -       */
        p->flags |= PF_MEMALLOC;
        lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
@@@ -1801,7 -1783,6 +1819,7 @@@ __alloc_pages_slowpath(gfp_t gfp_mask, 
  
        wake_all_kswapd(order, zonelist, high_zoneidx);
  
 +restart:
        /*
         * OK, we're below the kswapd watermark and have kicked background
         * reclaim. Now things get more complex, so set up alloc_flags according
         */
        alloc_flags = gfp_to_alloc_flags(gfp_mask);
  
 -restart:
        /* This is the last chance, in general, before the goto nopage. */
        page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
                        high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@@ -1943,7 -1925,6 +1961,7 @@@ __alloc_pages_nodemask(gfp_t gfp_mask, 
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
  
 +      trace_mm_page_alloc(page, order, gfp_mask, migratetype);
        return page;
  }
  EXPORT_SYMBOL(__alloc_pages_nodemask);
   */
  unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
  {
 -      struct page * page;
 +      struct page *page;
 +
 +      /*
 +       * __get_free_pages() returns a 32-bit address, which cannot represent
 +       * a highmem page
 +       */
 +      VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
 +
        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return (unsigned long) page_address(page);
  }
 -
  EXPORT_SYMBOL(__get_free_pages);
  
  unsigned long get_zeroed_page(gfp_t gfp_mask)
  {
 -      struct page * page;
 -
 -      /*
 -       * get_zeroed_page() returns a 32-bit address, which cannot represent
 -       * a highmem page
 -       */
 -      VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
 -
 -      page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
 -      if (page)
 -              return (unsigned long) page_address(page);
 -      return 0;
 +      return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
  }
 -
  EXPORT_SYMBOL(get_zeroed_page);
  
  void __pagevec_free(struct pagevec *pvec)
  {
        int i = pagevec_count(pvec);
  
 -      while (--i >= 0)
 +      while (--i >= 0) {
 +              trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
                free_hot_cold_page(pvec->pages[i], pvec->cold);
 +      }
  }
  
  void __free_pages(struct page *page, unsigned int order)
  {
        if (put_page_testzero(page)) {
 +              trace_mm_page_free_direct(page, order);
                if (order == 0)
                        free_hot_page(page);
                else
@@@ -2162,28 -2146,23 +2180,28 @@@ void show_free_areas(void
                }
        }
  
 -      printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
 -              " inactive_file:%lu"
 +      printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
 +              " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
                " unevictable:%lu"
 -              " dirty:%lu writeback:%lu unstable:%lu\n"
 -              " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
 +              " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n"
 +              " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 +              " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
                global_page_state(NR_ACTIVE_ANON),
 -              global_page_state(NR_ACTIVE_FILE),
                global_page_state(NR_INACTIVE_ANON),
 +              global_page_state(NR_ISOLATED_ANON),
 +              global_page_state(NR_ACTIVE_FILE),
                global_page_state(NR_INACTIVE_FILE),
 +              global_page_state(NR_ISOLATED_FILE),
                global_page_state(NR_UNEVICTABLE),
                global_page_state(NR_FILE_DIRTY),
                global_page_state(NR_WRITEBACK),
                global_page_state(NR_UNSTABLE_NFS),
 +              nr_blockdev_pages(),
                global_page_state(NR_FREE_PAGES),
 -              global_page_state(NR_SLAB_RECLAIMABLE) +
 -                      global_page_state(NR_SLAB_UNRECLAIMABLE),
 +              global_page_state(NR_SLAB_RECLAIMABLE),
 +              global_page_state(NR_SLAB_UNRECLAIMABLE),
                global_page_state(NR_FILE_MAPPED),
 +              global_page_state(NR_SHMEM),
                global_page_state(NR_PAGETABLE),
                global_page_state(NR_BOUNCE));
  
                        " active_file:%lukB"
                        " inactive_file:%lukB"
                        " unevictable:%lukB"
 +                      " isolated(anon):%lukB"
 +                      " isolated(file):%lukB"
                        " present:%lukB"
 +                      " mlocked:%lukB"
 +                      " dirty:%lukB"
 +                      " writeback:%lukB"
 +                      " mapped:%lukB"
 +                      " shmem:%lukB"
 +                      " slab_reclaimable:%lukB"
 +                      " slab_unreclaimable:%lukB"
 +                      " kernel_stack:%lukB"
 +                      " pagetables:%lukB"
 +                      " unstable:%lukB"
 +                      " bounce:%lukB"
 +                      " writeback_tmp:%lukB"
                        " pages_scanned:%lu"
                        " all_unreclaimable? %s"
                        "\n",
                        K(zone_page_state(zone, NR_ACTIVE_FILE)),
                        K(zone_page_state(zone, NR_INACTIVE_FILE)),
                        K(zone_page_state(zone, NR_UNEVICTABLE)),
 +                      K(zone_page_state(zone, NR_ISOLATED_ANON)),
 +                      K(zone_page_state(zone, NR_ISOLATED_FILE)),
                        K(zone->present_pages),
 +                      K(zone_page_state(zone, NR_MLOCK)),
 +                      K(zone_page_state(zone, NR_FILE_DIRTY)),
 +                      K(zone_page_state(zone, NR_WRITEBACK)),
 +                      K(zone_page_state(zone, NR_FILE_MAPPED)),
 +                      K(zone_page_state(zone, NR_SHMEM)),
 +                      K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
 +                      K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
 +                      zone_page_state(zone, NR_KERNEL_STACK) *
 +                              THREAD_SIZE / 1024,
 +                      K(zone_page_state(zone, NR_PAGETABLE)),
 +                      K(zone_page_state(zone, NR_UNSTABLE_NFS)),
 +                      K(zone_page_state(zone, NR_BOUNCE)),
 +                      K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
                        zone->pages_scanned,
                        (zone_is_all_unreclaimable(zone) ? "yes" : "no")
                        );
@@@ -2373,7 -2323,7 +2391,7 @@@ early_param("numa_zonelist_order", setu
   * sysctl handler for numa_zonelist_order
   */
  int numa_zonelist_order_handler(ctl_table *table, int write,
 -              struct file *file, void __user *buffer, size_t *length,
 +              void __user *buffer, size_t *length,
                loff_t *ppos)
  {
        char saved_string[NUMA_ZONELIST_ORDER_LEN];
        if (write)
                strncpy(saved_string, (char*)table->data,
                        NUMA_ZONELIST_ORDER_LEN);
 -      ret = proc_dostring(table, write, file, buffer, length, ppos);
 +      ret = proc_dostring(table, write, buffer, length, ppos);
        if (ret)
                return ret;
        if (write) {
@@@ -2851,8 -2801,7 +2869,8 @@@ static void setup_zone_migrate_reserve(
  {
        unsigned long start_pfn, pfn, end_pfn;
        struct page *page;
 -      unsigned long reserve, block_migratetype;
 +      unsigned long block_migratetype;
 +      int reserve;
  
        /* Get the start pfn, end pfn and the number of blocks to reserve */
        start_pfn = zone->zone_start_pfn;
        reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
                                                        pageblock_order;
  
 +      /*
 +       * Reserve blocks are generally in place to help high-order atomic
 +       * allocations that are short-lived. A min_free_kbytes value that
 +       * would result in more than 2 reserve blocks for atomic allocations
 +       * is assumed to be in place to help anti-fragmentation for the
 +       * future allocation of hugepages at runtime.
 +       */
 +      reserve = min(2, reserve);
 +
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                if (!pfn_valid(pfn))
                        continue;
@@@ -3039,7 -2979,6 +3057,7 @@@ static int zone_batchsize(struct zone *
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
  {
        struct per_cpu_pages *pcp;
 +      int migratetype;
  
        memset(p, 0, sizeof(*p));
  
        pcp->count = 0;
        pcp->high = 6 * batch;
        pcp->batch = max(1UL, 1 * batch);
 -      INIT_LIST_HEAD(&pcp->list);
 +      for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 +              INIT_LIST_HEAD(&pcp->lists[migratetype]);
  }
  
  /*
@@@ -3226,32 -3164,6 +3244,32 @@@ int zone_wait_table_init(struct zone *z
        return 0;
  }
  
 +static int __zone_pcp_update(void *data)
 +{
 +      struct zone *zone = data;
 +      int cpu;
 +      unsigned long batch = zone_batchsize(zone), flags;
 +
 +      for (cpu = 0; cpu < NR_CPUS; cpu++) {
 +              struct per_cpu_pageset *pset;
 +              struct per_cpu_pages *pcp;
 +
 +              pset = zone_pcp(zone, cpu);
 +              pcp = &pset->pcp;
 +
 +              local_irq_save(flags);
 +              free_pcppages_bulk(zone, pcp->count, pcp);
 +              setup_pageset(pset, batch);
 +              local_irq_restore(flags);
 +      }
 +      return 0;
 +}
 +
 +void zone_pcp_update(struct zone *zone)
 +{
 +      stop_machine(__zone_pcp_update, zone, NULL);
 +}
 +
  static __meminit void zone_pcp_init(struct zone *zone)
  {
        int cpu;
@@@ -3826,7 -3738,7 +3844,7 @@@ static void __paginginit free_area_init
                zone_pcp_init(zone);
                for_each_lru(l) {
                        INIT_LIST_HEAD(&zone->lru[l].list);
 -                      zone->lru[l].nr_saved_scan = 0;
 +                      zone->reclaim_stat.nr_saved_scan[l] = 0;
                }
                zone->reclaim_stat.recent_rotated[0] = 0;
                zone->reclaim_stat.recent_rotated[1] = 0;
@@@ -4615,7 -4527,7 +4633,7 @@@ void setup_per_zone_wmarks(void
        calculate_totalreserve_pages();
  }
  
 -/**
 +/*
   * The inactive anon list should be small enough that the VM never has to
   * do too much work, but large enough that each inactive page has a chance
   * to be referenced again before it is swapped out.
@@@ -4706,9 -4618,9 +4724,9 @@@ module_init(init_per_zone_wmark_min
   *    changes.
   */
  int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
 -      proc_dointvec(table, write, file, buffer, length, ppos);
 +      proc_dointvec(table, write, buffer, length, ppos);
        if (write)
                setup_per_zone_wmarks();
        return 0;
  
  #ifdef CONFIG_NUMA
  int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
        struct zone *zone;
        int rc;
  
 -      rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (rc)
                return rc;
  
  }
  
  int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
        struct zone *zone;
        int rc;
  
 -      rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (rc)
                return rc;
  
   * if in function of the boot time zone sizes.
   */
  int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
 -      proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      proc_dointvec_minmax(table, write, buffer, length, ppos);
        setup_per_zone_lowmem_reserve();
        return 0;
  }
   */
  
  int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
 -      struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 +      void __user *buffer, size_t *length, loff_t *ppos)
  {
        struct zone *zone;
        unsigned int cpu;
        int ret;
  
 -      ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 +      ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
        if (!write || (ret == -EINVAL))
                return ret;
        for_each_populated_zone(zone) {
@@@ -4838,14 -4750,7 +4856,14 @@@ void *__init alloc_large_system_hash(co
                        numentries <<= (PAGE_SHIFT - scale);
  
                /* Make sure we've got at least a 0-order allocation.. */
 -              if (unlikely((numentries * bucketsize) < PAGE_SIZE))
 +              if (unlikely(flags & HASH_SMALL)) {
 +                      /* Makes no sense without HASH_EARLY */
 +                      WARN_ON(!(flags & HASH_EARLY));
 +                      if (!(numentries >> *_hash_shift)) {
 +                              numentries = 1UL << *_hash_shift;
 +                              BUG_ON(!numentries);
 +                      }
 +              } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
                        numentries = PAGE_SIZE / bucketsize;
        }
        numentries = roundup_pow_of_two(numentries);
@@@ -4987,16 -4892,13 +5005,16 @@@ int set_migratetype_isolate(struct pag
        struct zone *zone;
        unsigned long flags;
        int ret = -EBUSY;
 +      int zone_idx;
  
        zone = page_zone(page);
 +      zone_idx = zone_idx(zone);
        spin_lock_irqsave(&zone->lock, flags);
        /*
         * In future, more migrate types will be able to be isolation target.
         */
 -      if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
 +      if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
 +          zone_idx != ZONE_MOVABLE)
                goto out;
        set_pageblock_migratetype(page, MIGRATE_ISOLATE);
        move_freepages_block(zone, page, MIGRATE_ISOLATE);
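
Illustrative aside: after the consolidation above, get_zeroed_page() is simply __get_free_pages(gfp_mask | __GFP_ZERO, 0), and the VM_BUG_ON now catches __GFP_HIGHMEM in both paths because the returned address must have a kernel mapping. A tiny usage sketch, with an invented function name:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_use_zeroed_page(void)
{
        unsigned long addr;

        addr = get_zeroed_page(GFP_KERNEL);     /* lowmem, pre-zeroed */
        if (!addr)
                return -ENOMEM;

        /* get_zeroed_page(GFP_HIGHUSER) would now trip the VM_BUG_ON */

        free_page(addr);
        return 0;
}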
diff --combined mm/rmap.c
index 720fc03a7bc454de75fa86f542770ab9b9660788,09c3d0b961168b6168a41b47a80e7f2c4774799f..28aafe2b530668b03c766619a83873ee2a91087e
+++ b/mm/rmap.c
   *                 mapping->tree_lock (widely used, in set_page_dirty,
   *                           in arch-dependent flush_dcache_mmap_lock,
   *                           within inode_lock in __sync_single_inode)
+  *
+  * (code doesn't rely on that order so it could be switched around)
+  * ->tasklist_lock
+  *   anon_vma->lock      (memory_failure, collect_procs_anon)
+  *     pte map lock
   */
  
  #include <linux/mm.h>
@@@ -191,7 -196,7 +196,7 @@@ void __init anon_vma_init(void
   * Getting a lock on a stable anon_vma from a page off the LRU is
   * tricky: page_lock_anon_vma rely on RCU to guard against the races.
   */
- static struct anon_vma *page_lock_anon_vma(struct page *page)
+ struct anon_vma *page_lock_anon_vma(struct page *page)
  {
        struct anon_vma *anon_vma;
        unsigned long anon_mapping;
@@@ -211,7 -216,7 +216,7 @@@ out
        return NULL;
  }
  
- static void page_unlock_anon_vma(struct anon_vma *anon_vma)
+ void page_unlock_anon_vma(struct anon_vma *anon_vma)
  {
        spin_unlock(&anon_vma->lock);
        rcu_read_unlock();
@@@ -311,7 -316,7 +316,7 @@@ pte_t *page_check_address(struct page *
   * if the page is not mapped into the page tables of this VMA.  Only
   * valid for normal file or anonymous VMAs.
   */
- static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
  {
        unsigned long address;
        pte_t *pte;
@@@ -710,6 -715,27 +715,6 @@@ void page_add_file_rmap(struct page *pa
        }
  }
  
 -#ifdef CONFIG_DEBUG_VM
 -/**
 - * page_dup_rmap - duplicate pte mapping to a page
 - * @page:     the page to add the mapping to
 - * @vma:      the vm area being duplicated
 - * @address:  the user virtual address mapped
 - *
 - * For copy_page_range only: minimal extract from page_add_file_rmap /
 - * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's
 - * quicker.
 - *
 - * The caller needs to hold the pte lock.
 - */
 -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
 -{
 -      if (PageAnon(page))
 -              __page_check_anon_rmap(page, vma, address);
 -      atomic_inc(&page->_mapcount);
 -}
 -#endif
 -
  /**
   * page_remove_rmap - take down pte mapping from a page
   * @page: page to remove mapping from
   */
  void page_remove_rmap(struct page *page)
  {
 -      if (atomic_add_negative(-1, &page->_mapcount)) {
 -              /*
 -               * Now that the last pte has gone, s390 must transfer dirty
 -               * flag from storage key to struct page.  We can usually skip
 -               * this if the page is anon, so about to be freed; but perhaps
 -               * not if it's in swapcache - there might be another pte slot
 -               * containing the swap entry, but page not yet written to swap.
 -               */
 -              if ((!PageAnon(page) || PageSwapCache(page)) &&
 -                  page_test_dirty(page)) {
 -                      page_clear_dirty(page);
 -                      set_page_dirty(page);
 -              }
 -              if (PageAnon(page))
 -                      mem_cgroup_uncharge_page(page);
 -              __dec_zone_page_state(page,
 -                      PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 -              mem_cgroup_update_mapped_file_stat(page, -1);
 -              /*
 -               * It would be tidy to reset the PageAnon mapping here,
 -               * but that might overwrite a racing page_add_anon_rmap
 -               * which increments mapcount after us but sets mapping
 -               * before us: so leave the reset to free_hot_cold_page,
 -               * and remember that it's only reliable while mapped.
 -               * Leaving it set also helps swapoff to reinstate ptes
 -               * faster for those pages still in swapcache.
 -               */
 +      /* page still mapped by someone else? */
 +      if (!atomic_add_negative(-1, &page->_mapcount))
 +              return;
 +
 +      /*
 +       * Now that the last pte has gone, s390 must transfer dirty
 +       * flag from storage key to struct page.  We can usually skip
 +       * this if the page is anon, so about to be freed; but perhaps
 +       * not if it's in swapcache - there might be another pte slot
 +       * containing the swap entry, but page not yet written to swap.
 +       */
 +      if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
 +              page_clear_dirty(page);
 +              set_page_dirty(page);
 +      }
 +      if (PageAnon(page)) {
 +              mem_cgroup_uncharge_page(page);
 +              __dec_zone_page_state(page, NR_ANON_PAGES);
 +      } else {
 +              __dec_zone_page_state(page, NR_FILE_MAPPED);
        }
 +      mem_cgroup_update_mapped_file_stat(page, -1);
 +      /*
 +       * It would be tidy to reset the PageAnon mapping here,
 +       * but that might overwrite a racing page_add_anon_rmap
 +       * which increments mapcount after us but sets mapping
 +       * before us: so leave the reset to free_hot_cold_page,
 +       * and remember that it's only reliable while mapped.
 +       * Leaving it set also helps swapoff to reinstate ptes
 +       * faster for those pages still in swapcache.
 +       */
  }
  
  /*
   * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
   */
  static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-                               int migration)
+                               enum ttu_flags flags)
  {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
         * If it's recently referenced (perhaps page_referenced
         * skipped over this mm) then we should reactivate it.
         */
-       if (!migration) {
+       if (!(flags & TTU_IGNORE_MLOCK)) {
                if (vma->vm_flags & VM_LOCKED) {
                        ret = SWAP_MLOCK;
                        goto out_unmap;
                }
+       }
+       if (!(flags & TTU_IGNORE_ACCESS)) {
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
                        ret = SWAP_FAIL;
                        goto out_unmap;
        /* Update high watermark before we lower rss */
        update_hiwater_rss(mm);
  
-       if (PageAnon(page)) {
+       if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
+               if (PageAnon(page))
+                       dec_mm_counter(mm, anon_rss);
+               else
+                       dec_mm_counter(mm, file_rss);
+               set_pte_at(mm, address, pte,
+                               swp_entry_to_pte(make_hwpoison_entry(page)));
+       } else if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };
  
                if (PageSwapCache(page)) {
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
-                       BUG_ON(!migration);
+                       BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
                        entry = make_migration_entry(page, pte_write(pteval));
                }
                set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
                BUG_ON(pte_file(*pte));
-       } else if (PAGE_MIGRATION && migration) {
+       } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) {
                /* Establish migration entry for a file page */
                swp_entry_t entry;
                entry = make_migration_entry(page, pte_write(pteval));
@@@ -996,12 -1028,13 +1010,13 @@@ static int try_to_mlock_page(struct pag
   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
   * 'LOCKED.
   */
- static int try_to_unmap_anon(struct page *page, int unlock, int migration)
+ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
  {
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
        unsigned int mlocked = 0;
        int ret = SWAP_AGAIN;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
  
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
                                continue;  /* must visit all unlocked vmas */
                        ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                break;
                }
  /**
   * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
   * @page: the page to unmap/unlock
-  * @unlock:  request for unlock rather than unmap [unlikely]
-  * @migration:  unmapping for migration - ignored if @unlock
+  * @flags: action and flags
   *
   * Find all the mappings of a page using the mapping pointer and the vma chains
   * contained in the address_space struct it points to.
   * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
   * 'LOCKED.
   */
- static int try_to_unmap_file(struct page *page, int unlock, int migration)
+ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
  {
        struct address_space *mapping = page->mapping;
        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        unsigned long max_nl_size = 0;
        unsigned int mapcount;
        unsigned int mlocked = 0;
+       int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
  
        if (MLOCK_PAGES && unlikely(unlock))
                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
                                continue;       /* must visit all vmas */
                        ret = SWAP_MLOCK;
                } else {
-                       ret = try_to_unmap_one(page, vma, migration);
+                       ret = try_to_unmap_one(page, vma, flags);
                        if (ret == SWAP_FAIL || !page_mapped(page))
                                goto out;
                }
                        ret = SWAP_MLOCK;       /* leave mlocked == 0 */
                        goto out;               /* no need to look further */
                }
-               if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
+               if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
+                       (vma->vm_flags & VM_LOCKED))
                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
-                       if (!MLOCK_PAGES && !migration &&
+                       if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
                            (vma->vm_flags & VM_LOCKED))
                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
@@@ -1177,7 -1211,7 +1193,7 @@@ out
  /**
   * try_to_unmap - try to remove all page table mappings to a page
   * @page: the page to get unmapped
-  * @migration: migration flag
+  * @flags: action and flags
   *
   * Tries to remove all the page table entries which are mapping this
   * page, used in the pageout path.  Caller must hold the page lock.
   * SWAP_FAIL  - the page is unswappable
   * SWAP_MLOCK - page is mlocked.
   */
- int try_to_unmap(struct page *page, int migration)
+ int try_to_unmap(struct page *page, enum ttu_flags flags)
  {
        int ret;
  
        BUG_ON(!PageLocked(page));
  
        if (PageAnon(page))
-               ret = try_to_unmap_anon(page, 0, migration);
+               ret = try_to_unmap_anon(page, flags);
        else
-               ret = try_to_unmap_file(page, 0, migration);
+               ret = try_to_unmap_file(page, flags);
        if (ret != SWAP_MLOCK && !page_mapped(page))
                ret = SWAP_SUCCESS;
        return ret;
@@@ -1222,8 -1256,8 +1238,8 @@@ int try_to_munlock(struct page *page
        VM_BUG_ON(!PageLocked(page) || PageLRU(page));
  
        if (PageAnon(page))
-               return try_to_unmap_anon(page, 1, 0);
+               return try_to_unmap_anon(page, TTU_MUNLOCK);
        else
-               return try_to_unmap_file(page, 1, 0);
+               return try_to_unmap_file(page, TTU_MUNLOCK);
  }
  
diff --combined mm/shmem.c
index b206a7a32e2a4e00bc7446ae839407f656031643,bec85895a1fe1d75c7e15c1115d128f411295dd0..98631c26c20001931a6e4ca13032716992d6808c
@@@ -49,6 -49,7 +49,6 @@@ static struct vfsmount *shm_mnt
  #include <linux/backing-dev.h>
  #include <linux/shmem_fs.h>
  #include <linux/writeback.h>
 -#include <linux/vfs.h>
  #include <linux/blkdev.h>
  #include <linux/security.h>
  #include <linux/swapops.h>
@@@ -1096,10 -1097,6 +1096,10 @@@ static int shmem_writepage(struct page 
        shmem_swp_unmap(entry);
  unlock:
        spin_unlock(&info->lock);
 +      /*
 +       * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 +       * clear the SWAP_HAS_CACHE flag.
 +       */
        swapcache_free(swap, NULL);
  redirty:
        set_page_dirty(page);
@@@ -1633,8 -1630,8 +1633,8 @@@ shmem_write_end(struct file *file, stru
        if (pos + copied > inode->i_size)
                i_size_write(inode, pos + copied);
  
-       unlock_page(page);
        set_page_dirty(page);
+       unlock_page(page);
        page_cache_release(page);
  
        return copied;
@@@ -1971,13 -1968,13 +1971,13 @@@ static int shmem_symlink(struct inode *
                        iput(inode);
                        return error;
                }
-               unlock_page(page);
                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_symlink_inode_operations;
                kaddr = kmap_atomic(page, KM_USER0);
                memcpy(kaddr, symname, len);
                kunmap_atomic(kaddr, KM_USER0);
                set_page_dirty(page);
+               unlock_page(page);
                page_cache_release(page);
        }
        if (dir->i_mode & S_ISGID)
@@@ -2301,7 -2298,8 +2301,7 @@@ static void shmem_put_super(struct supe
        sb->s_fs_info = NULL;
  }
  
 -static int shmem_fill_super(struct super_block *sb,
 -                          void *data, int silent)
 +int shmem_fill_super(struct super_block *sb, void *data, int silent)
  {
        struct inode *inode;
        struct dentry *root;
        int err = -ENOMEM;
  
        /* Round up to L1_CACHE_BYTES to resist false sharing */
 -      sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
 +      sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
                                L1_CACHE_BYTES), GFP_KERNEL);
        if (!sbinfo)
                return -ENOMEM;
  
 -      sbinfo->max_blocks = 0;
 -      sbinfo->max_inodes = 0;
        sbinfo->mode = S_IRWXUGO | S_ISVTX;
        sbinfo->uid = current_fsuid();
        sbinfo->gid = current_fsgid();
 -      sbinfo->mpol = NULL;
        sb->s_fs_info = sbinfo;
  
  #ifdef CONFIG_TMPFS
@@@ -2420,6 -2421,7 +2420,7 @@@ static const struct address_space_opera
        .write_end      = shmem_write_end,
  #endif
        .migratepage    = migrate_page,
+       .error_remove_page = generic_error_remove_page,
  };
  
  static const struct file_operations shmem_file_operations = {
@@@ -2518,7 -2520,7 +2519,7 @@@ static struct file_system_type tmpfs_fs
        .kill_sb        = kill_litter_super,
  };
  
 -static int __init init_tmpfs(void)
 +int __init init_tmpfs(void)
  {
        int error;
  
@@@ -2575,7 -2577,7 +2576,7 @@@ static struct file_system_type tmpfs_fs
        .kill_sb        = kill_litter_super,
  };
  
 -static int __init init_tmpfs(void)
 +int __init init_tmpfs(void)
  {
        BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
  
@@@ -2590,11 -2592,6 +2591,11 @@@ int shmem_unuse(swp_entry_t entry, stru
        return 0;
  }
  
 +int shmem_lock(struct file *file, int lock, struct user_struct *user)
 +{
 +      return 0;
 +}
 +
  #define shmem_vm_ops                          generic_file_vm_ops
  #define shmem_file_operations                 ramfs_file_operations
  #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev)
@@@ -2691,3 -2688,5 +2692,3 @@@ int shmem_zero_setup(struct vm_area_str
        vma->vm_ops = &shmem_vm_ops;
        return 0;
  }
 -
 -module_init(init_tmpfs)
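
The tmpfs change this series actually needs is small: shmem_aops gains
.error_remove_page = generic_error_remove_page next to .migratepage, so the
hwpoison handler may truncate a corrupted tmpfs page instead of having to
kill every process mapping it.  The stand-in C sketch below only models that
dispatch decision; the struct, the handler body and main() are simplified
assumptions, not the kernel's memory-failure code.

    /*
     * A filesystem that supplies ->error_remove_page lets the hwpoison
     * path truncate the bad page; one that doesn't leaves only the
     * "kill the mappers" fallback.  Simplified stand-in types.
     */
    #include <stdio.h>

    struct page;                            /* opaque for this sketch */

    struct address_space_operations {
            int (*error_remove_page)(void *mapping, struct page *page);
    };

    /* stand-in for generic_error_remove_page(): 0 == page truncated */
    static int generic_error_remove_page(void *mapping, struct page *page)
    {
            (void)mapping;
            (void)page;
            return 0;
    }

    /* migration-aware filesystems (tmpfs here) opt in as in the diff */
    static const struct address_space_operations shmem_like_aops = {
            .error_remove_page      = generic_error_remove_page,
    };

    static const struct address_space_operations legacy_aops = { 0 };

    static const char *hwpoison_policy(const struct address_space_operations *a)
    {
            return a->error_remove_page ? "truncate the page" : "kill the mappers";
    }

    int main(void)
    {
            printf("tmpfs-like fs: %s\n", hwpoison_policy(&shmem_like_aops));
            printf("legacy fs:     %s\n", hwpoison_policy(&legacy_aops));
            return 0;
    }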
diff --combined mm/swapfile.c
index f1bf19daadc67143b099518c1bc29aa52ae04227,ce5dda6d604b503f2284d727f1600a3f12af97eb..4de7f02f820b03bfcf36b5fc8d6827b5eecd38cb
@@@ -699,7 -699,7 +699,7 @@@ int free_swap_and_cache(swp_entry_t ent
        struct swap_info_struct *p;
        struct page *page = NULL;
  
-       if (is_migration_entry(entry))
+       if (non_swap_entry(entry))
                return 1;
  
        p = swap_info_get(entry);
@@@ -1575,9 -1575,9 +1575,9 @@@ SYSCALL_DEFINE1(swapoff, const char __u
        p->flags &= ~SWP_WRITEOK;
        spin_unlock(&swap_lock);
  
 -      current->flags |= PF_SWAPOFF;
 +      current->flags |= PF_OOM_ORIGIN;
        err = try_to_unuse(type);
 -      current->flags &= ~PF_SWAPOFF;
 +      current->flags &= ~PF_OOM_ORIGIN;
  
        if (err) {
                /* re-insert swap space back into swap_list */
@@@ -2085,7 -2085,7 +2085,7 @@@ static int __swap_duplicate(swp_entry_
        int count;
        bool has_cache;
  
-       if (is_migration_entry(entry))
+       if (non_swap_entry(entry))
                return -EINVAL;
  
        type = swp_type(entry);
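
Both hunks above widen the old is_migration_entry() check to non_swap_entry():
now that hwpoison entries share the swap-entry encoding, any entry whose type
falls outside the range of real swap devices has to be rejected here, not just
migration entries.  The standalone C sketch below shows the idea with a toy
bit layout; the field widths and the reserved type values are illustrative
assumptions, not the layout in swapops.h.

    /*
     * Toy swap-entry encoding: real swap devices use types below
     * MAX_SWAPFILES, while migration and hwpoison entries borrow type
     * values above it, so a single non_swap_entry() test catches both.
     */
    #include <stdio.h>

    #define MAX_SWAPFILES   32                      /* illustrative */
    #define SWP_MIGRATION   (MAX_SWAPFILES + 0)     /* illustrative */
    #define SWP_HWPOISON    (MAX_SWAPFILES + 1)     /* illustrative */

    typedef struct { unsigned long val; } swp_entry_t;

    static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
    {
            swp_entry_t e = { .val = (type << 24) | (offset & 0xffffff) };
            return e;
    }

    static unsigned long swp_type(swp_entry_t e)
    {
            return e.val >> 24;
    }

    /* true for migration and hwpoison entries: they name no swap device */
    static int non_swap_entry(swp_entry_t e)
    {
            return swp_type(e) >= MAX_SWAPFILES;
    }

    int main(void)
    {
            swp_entry_t swap = swp_entry(3, 12345);           /* device 3 */
            swp_entry_t bad  = swp_entry(SWP_HWPOISON, 777);  /* poisoned */

            printf("swap entry:     non_swap=%d\n", non_swap_entry(swap));
            printf("hwpoison entry: non_swap=%d\n", non_swap_entry(bad));
            return 0;
    }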
diff --combined mm/vmscan.c
index f444b7409085fc8e224da0b11e592b4364daccdd,ab3b0ad3ce527460fc08fe8169c16a6f59d479b6..1219ceb8a9b2d992da20bb9a10942e7cef2d98b1
@@@ -148,8 -148,8 +148,8 @@@ static struct zone_reclaim_stat *get_re
        return &zone->reclaim_stat;
  }
  
 -static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
 -                                 enum lru_list lru)
 +static unsigned long zone_nr_lru_pages(struct zone *zone,
 +                              struct scan_control *sc, enum lru_list lru)
  {
        if (!scanning_global_lru(sc))
                return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
@@@ -286,12 -286,7 +286,12 @@@ static inline int page_mapping_inuse(st
  
  static inline int is_page_cache_freeable(struct page *page)
  {
 -      return page_count(page) - !!page_has_private(page) == 2;
 +      /*
 +       * A freeable page cache page is referenced only by the caller
 +       * that isolated the page, the page cache radix tree and
 +       * optional buffer heads at page->private.
 +       */
 +      return page_count(page) - page_has_private(page) == 2;
  }
  
  static int may_write_to_queue(struct backing_dev_info *bdi)
@@@ -366,6 -361,7 +366,6 @@@ static pageout_t pageout(struct page *p
         * block, for some throttling. This happens by accident, because
         * swap_backing_dev_info is bust: it doesn't reflect the
         * congestion state of the swapdevs.  Easy to fix, if needed.
 -       * See swapfile.c:page_queue_congested().
         */
        if (!is_page_cache_freeable(page))
                return PAGE_KEEP;
@@@ -535,7 -531,7 +535,7 @@@ redo
                 * unevictable page on [in]active list.
                 * We know how to handle that.
                 */
 -              lru = active + page_is_file_cache(page);
 +              lru = active + page_lru_base_type(page);
                lru_cache_add_lru(page, lru);
        } else {
                /*
@@@ -663,7 -659,7 +663,7 @@@ static unsigned long shrink_page_list(s
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, 0)) {
+                       switch (try_to_unmap(page, TTU_UNMAP)) {
                        case SWAP_FAIL:
                                goto activate_locked;
                        case SWAP_AGAIN:
@@@ -825,7 -821,7 +825,7 @@@ int __isolate_lru_page(struct page *pag
        if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
                return ret;
  
 -      if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
 +      if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
                return ret;
  
        /*
@@@ -939,16 -935,6 +939,16 @@@ static unsigned long isolate_lru_pages(
                        /* Check that we have not crossed a zone boundary. */
                        if (unlikely(page_zone_id(cursor_page) != zone_id))
                                continue;
 +
 +                      /*
 +                       * If we don't have enough swap space, reclaiming an
 +                       * anon page that doesn't already have a swap slot is
 +                       * pointless.
 +                       */
 +                      if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
 +                                      !PageSwapCache(cursor_page))
 +                              continue;
 +
                        if (__isolate_lru_page(cursor_page, mode, file) == 0) {
                                list_move(&cursor_page->lru, dst);
                                mem_cgroup_del_lru(cursor_page);
@@@ -975,7 -961,7 +975,7 @@@ static unsigned long isolate_pages_glob
        if (file)
                lru += LRU_FILE;
        return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
 -                                                              mode, !!file);
 +                                                              mode, file);
  }
  
  /*
@@@ -990,7 -976,7 +990,7 @@@ static unsigned long clear_active_flags
        struct page *page;
  
        list_for_each_entry(page, page_list, lru) {
 -              lru = page_is_file_cache(page);
 +              lru = page_lru_base_type(page);
                if (PageActive(page)) {
                        lru += LRU_ACTIVE;
                        ClearPageActive(page);
@@@ -1047,31 -1033,6 +1047,31 @@@ int isolate_lru_page(struct page *page
        return ret;
  }
  
 +/*
 + * Are there way too many processes in the direct reclaim path already?
 + */
 +static int too_many_isolated(struct zone *zone, int file,
 +              struct scan_control *sc)
 +{
 +      unsigned long inactive, isolated;
 +
 +      if (current_is_kswapd())
 +              return 0;
 +
 +      if (!scanning_global_lru(sc))
 +              return 0;
 +
 +      if (file) {
 +              inactive = zone_page_state(zone, NR_INACTIVE_FILE);
 +              isolated = zone_page_state(zone, NR_ISOLATED_FILE);
 +      } else {
 +              inactive = zone_page_state(zone, NR_INACTIVE_ANON);
 +              isolated = zone_page_state(zone, NR_ISOLATED_ANON);
 +      }
 +
 +      return isolated > inactive;
 +}
 +
  /*
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
@@@ -1087,14 -1048,6 +1087,14 @@@ static unsigned long shrink_inactive_li
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        int lumpy_reclaim = 0;
  
 +      while (unlikely(too_many_isolated(zone, file, sc))) {
 +              congestion_wait(WRITE, HZ/10);
 +
 +              /* We are about to die and free our memory. Return now. */
 +              if (fatal_signal_pending(current))
 +                      return SWAP_CLUSTER_MAX;
 +      }
 +
        /*
         * If we need a large contiguous chunk of memory, or have
         * trouble getting a small set of contiguous pages, we
                unsigned long nr_active;
                unsigned int count[NR_LRU_LISTS] = { 0, };
                int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
 +              unsigned long nr_anon;
 +              unsigned long nr_file;
  
                nr_taken = sc->isolate_pages(sc->swap_cluster_max,
                             &page_list, &nr_scan, sc->order, mode,
                                zone, sc->mem_cgroup, 0, file);
 +
 +              if (scanning_global_lru(sc)) {
 +                      zone->pages_scanned += nr_scan;
 +                      if (current_is_kswapd())
 +                              __count_zone_vm_events(PGSCAN_KSWAPD, zone,
 +                                                     nr_scan);
 +                      else
 +                              __count_zone_vm_events(PGSCAN_DIRECT, zone,
 +                                                     nr_scan);
 +              }
 +
 +              if (nr_taken == 0)
 +                      goto done;
 +
                nr_active = clear_active_flags(&page_list, count);
                __count_vm_events(PGDEACTIVATE, nr_active);
  
                __mod_zone_page_state(zone, NR_INACTIVE_ANON,
                                                -count[LRU_INACTIVE_ANON]);
  
 -              if (scanning_global_lru(sc))
 -                      zone->pages_scanned += nr_scan;
 +              nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
 +              nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
 +              __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
 +              __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
  
                reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
                reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
                }
  
                nr_reclaimed += nr_freed;
 +
                local_irq_disable();
 -              if (current_is_kswapd()) {
 -                      __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
 +              if (current_is_kswapd())
                        __count_vm_events(KSWAPD_STEAL, nr_freed);
 -              } else if (scanning_global_lru(sc))
 -                      __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
 -
                __count_zone_vm_events(PGSTEAL, zone, nr_freed);
  
 -              if (nr_taken == 0)
 -                      goto done;
 -
                spin_lock(&zone->lru_lock);
                /*
                 * Put back any unfreeable pages.
                        SetPageLRU(page);
                        lru = page_lru(page);
                        add_page_to_lru_list(zone, page, lru);
 -                      if (PageActive(page)) {
 -                              int file = !!page_is_file_cache(page);
 +                      if (is_active_lru(lru)) {
 +                              int file = is_file_lru(lru);
                                reclaim_stat->recent_rotated[file]++;
                        }
                        if (!pagevec_add(&pvec, page)) {
                                spin_lock_irq(&zone->lru_lock);
                        }
                }
 +              __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
 +              __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
 +
        } while (nr_scanned < max_scan);
 -      spin_unlock(&zone->lru_lock);
 +
  done:
 -      local_irq_enable();
 +      spin_unlock_irq(&zone->lru_lock);
        pagevec_release(&pvec);
        return nr_reclaimed;
  }
@@@ -1277,10 -1215,15 +1277,10 @@@ static void move_active_pages_to_lru(st
  
        while (!list_empty(list)) {
                page = lru_to_page(list);
 -              prefetchw_prev_lru_page(page, list, flags);
  
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
  
 -              VM_BUG_ON(!PageActive(page));
 -              if (!is_active_lru(lru))
 -                      ClearPageActive(page);  /* we are de-activating */
 -
                list_move(&page->lru, &zone->lru[lru].list);
                mem_cgroup_add_lru_list(page, lru);
                pgmoved++;
  static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                        struct scan_control *sc, int priority, int file)
  {
 -      unsigned long pgmoved;
 +      unsigned long nr_taken;
        unsigned long pgscanned;
        unsigned long vm_flags;
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_inactive);
        struct page *page;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 +      unsigned long nr_rotated = 0;
  
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
 -      pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
 +      nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
                                        ISOLATE_ACTIVE, zone,
                                        sc->mem_cgroup, 1, file);
        /*
        if (scanning_global_lru(sc)) {
                zone->pages_scanned += pgscanned;
        }
 -      reclaim_stat->recent_scanned[!!file] += pgmoved;
 +      reclaim_stat->recent_scanned[file] += nr_taken;
  
        __count_zone_vm_events(PGREFILL, zone, pgscanned);
        if (file)
 -              __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
 +              __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
        else
 -              __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 +              __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
 +      __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
        spin_unlock_irq(&zone->lru_lock);
  
 -      pgmoved = 0;  /* count referenced (mapping) mapped pages */
        while (!list_empty(&l_hold)) {
                cond_resched();
                page = lru_to_page(&l_hold);
                /* page_referenced clears PageReferenced */
                if (page_mapping_inuse(page) &&
                    page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
 -                      pgmoved++;
 +                      nr_rotated++;
                        /*
                         * Identify referenced, file-backed active pages and
                         * give them one more trip around the active list. So
                        }
                }
  
 +              ClearPageActive(page);  /* we are de-activating */
                list_add(&page->lru, &l_inactive);
        }
  
         * helps balance scan pressure between file and anonymous pages in
         * get_scan_ratio.
         */
 -      reclaim_stat->recent_rotated[!!file] += pgmoved;
 +      reclaim_stat->recent_rotated[file] += nr_rotated;
  
        move_active_pages_to_lru(zone, &l_active,
                                                LRU_ACTIVE + file * LRU_FILE);
        move_active_pages_to_lru(zone, &l_inactive,
                                                LRU_BASE   + file * LRU_FILE);
 -
 +      __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
        spin_unlock_irq(&zone->lru_lock);
  }
  
@@@ -1488,10 -1429,10 +1488,10 @@@ static void get_scan_ratio(struct zone 
        unsigned long ap, fp;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
  
 -      anon  = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
 -              zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
 -      file  = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
 -              zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
 +      anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
 +              zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
 +      file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
 +              zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
  
        if (scanning_global_lru(sc)) {
                free  = zone_page_state(zone, NR_FREE_PAGES);
@@@ -1585,7 -1526,6 +1585,7 @@@ static void shrink_zone(int priority, s
        enum lru_list l;
        unsigned long nr_reclaimed = sc->nr_reclaimed;
        unsigned long swap_cluster_max = sc->swap_cluster_max;
 +      struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        int noswap = 0;
  
        /* If we have no swap space, do not bother scanning anon pages. */
                int file = is_file_lru(l);
                unsigned long scan;
  
 -              scan = zone_nr_pages(zone, sc, l);
 +              scan = zone_nr_lru_pages(zone, sc, l);
                if (priority || noswap) {
                        scan >>= priority;
                        scan = (scan * percent[file]) / 100;
                }
 -              if (scanning_global_lru(sc))
 -                      nr[l] = nr_scan_try_batch(scan,
 -                                                &zone->lru[l].nr_saved_scan,
 -                                                swap_cluster_max);
 -              else
 -                      nr[l] = scan;
 +              nr[l] = nr_scan_try_batch(scan,
 +                                        &reclaim_stat->nr_saved_scan[l],
 +                                        swap_cluster_max);
        }
  
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@@ -1742,7 -1685,7 +1742,7 @@@ static unsigned long do_try_to_free_pag
                        if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                                continue;
  
 -                      lru_pages += zone_lru_pages(zone);
 +                      lru_pages += zone_reclaimable_pages(zone);
                }
        }
  
@@@ -1836,45 -1779,11 +1836,45 @@@ unsigned long try_to_free_pages(struct 
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
  
 +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 +                                              gfp_t gfp_mask, bool noswap,
 +                                              unsigned int swappiness,
 +                                              struct zone *zone, int nid)
 +{
 +      struct scan_control sc = {
 +              .may_writepage = !laptop_mode,
 +              .may_unmap = 1,
 +              .may_swap = !noswap,
 +              .swap_cluster_max = SWAP_CLUSTER_MAX,
 +              .swappiness = swappiness,
 +              .order = 0,
 +              .mem_cgroup = mem,
 +              .isolate_pages = mem_cgroup_isolate_pages,
 +      };
 +      nodemask_t nm  = nodemask_of_node(nid);
 +
 +      sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 +                      (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 +      sc.nodemask = &nm;
 +      sc.nr_reclaimed = 0;
 +      sc.nr_scanned = 0;
 +      /*
 +       * NOTE: Although we can get the priority field, using it
 +       * here is not a good idea, since it limits the pages we can scan.
 +       * If we don't reclaim here, shrink_zone() from balance_pgdat()
 +       * will pick up pages from other mem cgroups as well. We therefore
 +       * hard-code the priority to zero.
 +       */
 +      shrink_zone(0, zone, &sc);
 +      return sc.nr_reclaimed;
 +}
 +
  unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                                           gfp_t gfp_mask,
                                           bool noswap,
                                           unsigned int swappiness)
  {
 +      struct zonelist *zonelist;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .isolate_pages = mem_cgroup_isolate_pages,
                .nodemask = NULL, /* we don't care the placement */
        };
 -      struct zonelist *zonelist;
  
        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@@ -1992,7 -1902,7 +1992,7 @@@ loop_again
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
  
 -                      lru_pages += zone_lru_pages(zone);
 +                      lru_pages += zone_reclaimable_pages(zone);
                }
  
                /*
                for (i = 0; i <= end_zone; i++) {
                        struct zone *zone = pgdat->node_zones + i;
                        int nr_slab;
 +                      int nid, zid;
  
                        if (!populated_zone(zone))
                                continue;
                        temp_priority[i] = priority;
                        sc.nr_scanned = 0;
                        note_zone_scanning_priority(zone, priority);
 +
 +                      nid = pgdat->node_id;
 +                      zid = zone_idx(zone);
 +                      /*
 +                       * Call soft limit reclaim before calling shrink_zone.
 +                       * For now we ignore the return value.
 +                       */
 +                      mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
 +                                                      nid, zid);
                        /*
                         * We put equal pressure on every zone, unless one
                         * zone has way too many pages free already.
                        if (zone_is_all_unreclaimable(zone))
                                continue;
                        if (nr_slab == 0 && zone->pages_scanned >=
 -                                              (zone_lru_pages(zone) * 6))
 +                                      (zone_reclaimable_pages(zone) * 6))
                                        zone_set_flag(zone,
                                                      ZONE_ALL_UNRECLAIMABLE);
                        /*
@@@ -2213,39 -2113,12 +2213,39 @@@ void wakeup_kswapd(struct zone *zone, i
        wake_up_interruptible(&pgdat->kswapd_wait);
  }
  
 -unsigned long global_lru_pages(void)
 +/*
 + * The reclaimable count is mostly accurate.  It may overestimate,
 + * because some of the counted pages are harder to reclaim:
 + * - mlocked pages, which will be moved to the unevictable list when
 + *   encountered
 + * - mapped pages, which may require several passes to be reclaimed
 + * - dirty pages, which are not "instantly" reclaimable
 + */
 +unsigned long global_reclaimable_pages(void)
 +{
 +      int nr;
 +
 +      nr = global_page_state(NR_ACTIVE_FILE) +
 +           global_page_state(NR_INACTIVE_FILE);
 +
 +      if (nr_swap_pages > 0)
 +              nr += global_page_state(NR_ACTIVE_ANON) +
 +                    global_page_state(NR_INACTIVE_ANON);
 +
 +      return nr;
 +}
 +
 +unsigned long zone_reclaimable_pages(struct zone *zone)
  {
 -      return global_page_state(NR_ACTIVE_ANON)
 -              + global_page_state(NR_ACTIVE_FILE)
 -              + global_page_state(NR_INACTIVE_ANON)
 -              + global_page_state(NR_INACTIVE_FILE);
 +      int nr;
 +
 +      nr = zone_page_state(zone, NR_ACTIVE_FILE) +
 +           zone_page_state(zone, NR_INACTIVE_FILE);
 +
 +      if (nr_swap_pages > 0)
 +              nr += zone_page_state(zone, NR_ACTIVE_ANON) +
 +                    zone_page_state(zone, NR_INACTIVE_ANON);
 +
 +      return nr;
  }
  
  #ifdef CONFIG_HIBERNATION
@@@ -2260,7 -2133,6 +2260,7 @@@ static void shrink_all_zones(unsigned l
  {
        struct zone *zone;
        unsigned long nr_reclaimed = 0;
 +      struct zone_reclaim_stat *reclaim_stat;
  
        for_each_populated_zone(zone) {
                enum lru_list l;
                                                l == LRU_ACTIVE_FILE))
                                continue;
  
 -                      zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
 -                      if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
 +                      reclaim_stat = get_reclaim_stat(zone, sc);
 +                      reclaim_stat->nr_saved_scan[l] +=
 +                                              (lru_pages >> prio) + 1;
 +                      if (reclaim_stat->nr_saved_scan[l]
 +                                              >= nr_pages || pass > 3) {
                                unsigned long nr_to_scan;
  
 -                              zone->lru[l].nr_saved_scan = 0;
 +                              reclaim_stat->nr_saved_scan[l] = 0;
                                nr_to_scan = min(nr_pages, lru_pages);
                                nr_reclaimed += shrink_list(l, nr_to_scan, zone,
                                                                sc, prio);
@@@ -2321,7 -2190,7 +2321,7 @@@ unsigned long shrink_all_memory(unsigne
  
        current->reclaim_state = &reclaim_state;
  
 -      lru_pages = global_lru_pages();
 +      lru_pages = global_reclaimable_pages();
        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
        /* If slab caches are huge, it's better to hit them first */
        while (nr_slab >= lru_pages) {
  
                        reclaim_state.reclaimed_slab = 0;
                        shrink_slab(sc.nr_scanned, sc.gfp_mask,
 -                                      global_lru_pages());
 +                                  global_reclaimable_pages());
                        sc.nr_reclaimed += reclaim_state.reclaimed_slab;
                        if (sc.nr_reclaimed >= nr_pages)
                                goto out;
        if (!sc.nr_reclaimed) {
                do {
                        reclaim_state.reclaimed_slab = 0;
 -                      shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
 +                      shrink_slab(nr_pages, sc.gfp_mask,
 +                                  global_reclaimable_pages());
                        sc.nr_reclaimed += reclaim_state.reclaimed_slab;
                } while (sc.nr_reclaimed < nr_pages &&
                                reclaim_state.reclaimed_slab > 0);
@@@ -2701,7 -2569,7 +2701,7 @@@ static void check_move_unevictable_page
  retry:
        ClearPageUnevictable(page);
        if (page_evictable(page, NULL)) {
 -              enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
 +              enum lru_list l = page_lru_base_type(page);
  
                __dec_zone_state(zone, NR_UNEVICTABLE);
                list_move(&page->lru, &zone->lru[l].list);
@@@ -2844,10 -2712,10 +2844,10 @@@ static void scan_all_zones_unevictable_
  unsigned long scan_unevictable_pages;
  
  int scan_unevictable_handler(struct ctl_table *table, int write,
 -                         struct file *file, void __user *buffer,
 +                         void __user *buffer,
                           size_t *length, loff_t *ppos)
  {
 -      proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
 +      proc_doulongvec_minmax(table, write, buffer, length, ppos);
  
        if (write && *(unsigned long *)table->data)
                scan_all_zones_unevictable_pages();