Merge branch 'master' into percpu
author Tejun Heo <tj@kernel.org>
Tue, 5 Jan 2010 00:17:33 +0000 (09:17 +0900)
committer Tejun Heo <tj@kernel.org>
Tue, 5 Jan 2010 00:17:33 +0000 (09:17 +0900)
Conflicts:
arch/powerpc/platforms/pseries/hvCall.S
include/linux/percpu.h

12 files changed:
arch/blackfin/mach-common/entry.S
arch/x86/include/asm/system.h
arch/x86/kernel/apic/nmi.c
arch/x86/kernel/head_32.S
arch/x86/kernel/vmlinux.lds.S
include/linux/compiler.h
include/linux/percpu.h
include/linux/vmstat.h
kernel/rcutorture.c
kernel/trace/trace.c
kernel/trace/trace_functions_graph.c
mm/percpu.c

diff --combined arch/blackfin/mach-common/entry.S
index a3ea7e9fe43b1644fbe043c5722a1bbf823ffd58,b0ed0b487ff24dbd94b66565f8843afda8feea71..01b2f58dfb95f9e83d8f5cbf8067fb68358e6c9f
@@@ -1,32 -1,11 +1,11 @@@
  /*
-  * File:         arch/blackfin/mach-common/entry.S
-  * Based on:
-  * Author:       Linus Torvalds
+  * Contains the system-call and fault low-level handling routines.
+  * This also contains the timer-interrupt handler, as well as all
+  * interrupts and faults that can result in a task-switch.
   *
-  * Created:      ?
-  * Description:  contains the system-call and fault low-level handling routines.
-  *               This also contains the timer-interrupt handler, as well as all
-  *               interrupts and faults that can result in a task-switch.
+  * Copyright 2005-2009 Analog Devices Inc.
   *
-  * Modified:
-  *               Copyright 2004-2006 Analog Devices Inc.
-  *
-  * Bugs:         Enter bugs at http://blackfin.uclinux.org/
-  *
-  * This program is free software; you can redistribute it and/or modify
-  * it under the terms of the GNU General Public License as published by
-  * the Free Software Foundation; either version 2 of the License, or
-  * (at your option) any later version.
-  *
-  * This program is distributed in the hope that it will be useful,
-  * but WITHOUT ANY WARRANTY; without even the implied warranty of
-  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-  * GNU General Public License for more details.
-  *
-  * You should have received a copy of the GNU General Public License
-  * along with this program; if not, see the file COPYING, or write
-  * to the Free Software Foundation, Inc.,
-  * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+  * Licensed under the GPL-2 or later.
   */
  
  /* NOTE: This code handles signal-recognition, which happens every time
@@@ -734,6 -713,8 +713,8 @@@ ENTRY(_system_call
        cc = BITTST(r7, TIF_RESTORE_SIGMASK);
        if cc jump .Lsyscall_do_signals;
        cc = BITTST(r7, TIF_SIGPENDING);
+       if cc jump .Lsyscall_do_signals;
+       cc = BITTST(r7, TIF_NOTIFY_RESUME);
        if !cc jump .Lsyscall_really_exit;
  .Lsyscall_do_signals:
        /* Reenable interrupts.  */
  
        r0 = sp;
        SP += -12;
-       call _do_signal;
+       call _do_notify_resume;
        SP += 12;
  
  .Lsyscall_really_exit:
@@@ -835,8 -816,8 +816,8 @@@ ENDPROC(_resume
  
  ENTRY(_ret_from_exception)
  #ifdef CONFIG_IPIPE
 -      p2.l = _per_cpu__ipipe_percpu_domain;
 -      p2.h = _per_cpu__ipipe_percpu_domain;
 +      p2.l = _ipipe_percpu_domain;
 +      p2.h = _ipipe_percpu_domain;
        r0.l = _ipipe_root;
        r0.h = _ipipe_root;
        r2 = [p2];
@@@ -1443,7 -1424,7 +1424,7 @@@ ENTRY(_sys_call_table
        .long _sys_ni_syscall   /* streams2 */
        .long _sys_vfork                /* 190 */
        .long _sys_getrlimit
-       .long _sys_mmap2
+       .long _sys_mmap_pgoff
        .long _sys_truncate64
        .long _sys_ftruncate64
        .long _sys_stat64       /* 195 */
        .long _sys_pwritev
        .long _sys_rt_tgsigqueueinfo
        .long _sys_perf_event_open
+       .long _sys_recvmmsg             /* 370 */
  
        .rept NR_syscalls-(.-_sys_call_table)/4
        .long _sys_ni_syscall
diff --combined arch/x86/include/asm/system.h
index de10c19d95586b6b59251ad959d6e8990dd65d9c,ecb544e65382893970f2090dd3bb341d03583f4a..e529f26c3292762193e6281f832fc90e01daa1b2
@@@ -23,6 -23,7 +23,7 @@@ struct task_struct *__switch_to(struct 
  struct tss_struct;
  void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                      struct tss_struct *tss);
+ extern void show_regs_common(void);
  
  #ifdef CONFIG_X86_32
  
@@@ -31,7 -32,7 +32,7 @@@
        "movl %P[task_canary](%[next]), %%ebx\n\t"                      \
        "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
  #define __switch_canary_oparam                                                \
 -      , [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
 +      , [stack_canary] "=m" (stack_canary.canary)
  #define __switch_canary_iparam                                                \
        , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
  #else /* CC_STACKPROTECTOR */
@@@ -113,7 -114,7 +114,7 @@@ do {                                                                       
        "movq %P[task_canary](%%rsi),%%r8\n\t"                            \
        "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
  #define __switch_canary_oparam                                                  \
 -      , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary))
 +      , [gs_canary] "=m" (irq_stack_union.stack_canary)
  #define __switch_canary_iparam                                                  \
        , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
  #else /* CC_STACKPROTECTOR */
             "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
             "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
             "call __switch_to\n\t"                                       \
-            ".globl thread_return\n"                                     \
-            "thread_return:\n\t"                                         \
             "movq "__percpu_arg([current_task])",%%rsi\n\t"              \
             __switch_canary                                              \
             "movq %P[thread_info](%%rsi),%%r8\n\t"                       \
             "movq %%rax,%%rdi\n\t"                                       \
 -           "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"         \
 +           "testl  %[_tif_fork],%P[ti_flags](%%r8)\n\t"                 \
             "jnz   ret_from_fork\n\t"                                    \
             RESTORE_CONTEXT                                              \
             : "=a" (last)                                                \
               [ti_flags] "i" (offsetof(struct thread_info, flags)),      \
               [_tif_fork] "i" (_TIF_FORK),                               \
               [thread_info] "i" (offsetof(struct task_struct, stack)),   \
 -             [current_task] "m" (per_cpu_var(current_task))             \
 +             [current_task] "m" (current_task)                          \
               __switch_canary_iparam                                     \
             : "memory", "cc" __EXTRA_CLOBBER)
  #endif
@@@ -157,19 -156,22 +156,22 @@@ extern void native_load_gs_index(unsign
   * Load a segment. Fall back on loading the zero
   * segment if something goes wrong..
   */
- #define loadsegment(seg, value)                       \
-       asm volatile("\n"                       \
-                    "1:\t"                     \
-                    "movl %k0,%%" #seg "\n"    \
-                    "2:\n"                     \
-                    ".section .fixup,\"ax\"\n" \
-                    "3:\t"                     \
-                    "movl %k1, %%" #seg "\n\t" \
-                    "jmp 2b\n"                 \
-                    ".previous\n"              \
-                    _ASM_EXTABLE(1b,3b)        \
-                    : :"r" (value), "r" (0) : "memory")
+ #define loadsegment(seg, value)                                               \
+ do {                                                                  \
+       unsigned short __val = (value);                                 \
+                                                                       \
+       asm volatile("                                          \n"     \
+                    "1:        movl %k0,%%" #seg "             \n"     \
+                                                                       \
+                    ".section .fixup,\"ax\"                    \n"     \
+                    "2:        xorl %k0,%k0                    \n"     \
+                    "          jmp 1b                          \n"     \
+                    ".previous                                 \n"     \
+                                                                       \
+                    _ASM_EXTABLE(1b, 2b)                               \
+                                                                       \
+                    : "+r" (__val) : : "memory");                      \
+ } while (0)
  
  /*
   * Save a segment register away
diff --combined arch/x86/kernel/apic/nmi.c
index 45404379d173637133731004540b8511495bee17,0159a69396cba449a424190459a02d83a3f417d8..4ada42c3dabb97aec6dd0dd27b5405258d31397a
@@@ -39,7 -39,8 +39,8 @@@
  int unknown_nmi_panic;
  int nmi_watchdog_enabled;
  
- static cpumask_t backtrace_mask __read_mostly;
+ /* For reliability, we're prepared to waste bits here. */
+ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
  
  /* nmi_active:
   * >0: the lapic NMI watchdog is active, but can be disabled
@@@ -414,7 -415,7 +415,7 @@@ nmi_watchdog_tick(struct pt_regs *regs
        }
  
        /* We can be called before check_nmi_watchdog, hence NULL check. */
-       if (cpumask_test_cpu(cpu, &backtrace_mask)) {
+       if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
                static DEFINE_SPINLOCK(lock);   /* Serialise the printks */
  
                spin_lock(&lock);
                show_regs(regs);
                dump_stack();
                spin_unlock(&lock);
-               cpumask_clear_cpu(cpu, &backtrace_mask);
+               cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
  
                rc = 1;
        }
                 * Ayiee, looks like this CPU is stuck ...
                 * wait a few IRQs (5 seconds) before doing the oops ...
                 */
 -              __this_cpu_inc(per_cpu_var(alert_counter));
 -              if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz)
 +              __this_cpu_inc(alert_counter);
 +              if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
                        /*
                         * die_nmi will return ONLY if NOTIFY_STOP happens..
                         */
                                regs, panic_on_timeout);
        } else {
                __get_cpu_var(last_irq_sum) = sum;
 -              __this_cpu_write(per_cpu_var(alert_counter), 0);
 +              __this_cpu_write(alert_counter, 0);
        }
  
        /* see if the nmi watchdog went off */
@@@ -558,14 -559,14 +559,14 @@@ void arch_trigger_all_cpu_backtrace(voi
  {
        int i;
  
-       cpumask_copy(&backtrace_mask, cpu_online_mask);
+       cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
  
        printk(KERN_INFO "sending NMI to all CPUs:\n");
        apic->send_IPI_all(NMI_VECTOR);
  
        /* Wait for up to 10 seconds for all CPUs to do the backtrace */
        for (i = 0; i < 10 * 1000; i++) {
-               if (cpumask_empty(&backtrace_mask))
+               if (cpumask_empty(to_cpumask(backtrace_mask)))
                        break;
                mdelay(1);
        }
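
The nmi.c hunks above replace a full cpumask_t with a NR_CPUS-sized DECLARE_BITMAP() that is converted on use with to_cpumask(). A minimal sketch of the same pattern follows; the mask name and helper functions are hypothetical, not part of the patch:

#include <linux/types.h>
#include <linux/cache.h>
#include <linux/cpumask.h>

/*
 * Static storage sized for the worst case; to_cpumask() yields a
 * struct cpumask * view of the underlying bitmap (hypothetical name).
 */
static DECLARE_BITMAP(pending_mask, NR_CPUS) __read_mostly;

static void mark_all_online_pending(void)
{
	cpumask_copy(to_cpumask(pending_mask), cpu_online_mask);
}

static bool test_and_clear_pending(int cpu)
{
	if (!cpumask_test_cpu(cpu, to_cpumask(pending_mask)))
		return false;
	cpumask_clear_cpu(cpu, to_cpumask(pending_mask));
	return true;
}
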
diff --combined arch/x86/kernel/head_32.S
index fd39eaf83b8487606166fb7425378c5afdc06714,7fd318bac59ce54294ca3e33cef0347c39240f6c..37c3d4b17d859d6ee38029a83f2abcaee6d4dc05
@@@ -18,6 -18,8 +18,8 @@@
  #include <asm/asm-offsets.h>
  #include <asm/setup.h>
  #include <asm/processor-flags.h>
+ #include <asm/msr-index.h>
+ #include <asm/cpufeature.h>
  #include <asm/percpu.h>
  
  /* Physical address */
@@@ -297,25 -299,27 +299,27 @@@ ENTRY(startup_32_smp
        orl %edx,%eax
        movl %eax,%cr4
  
-       btl $5, %eax            # check if PAE is enabled
-       jnc 6f
+       testb $X86_CR4_PAE, %al         # check if PAE is enabled
+       jz 6f
  
        /* Check if extended functions are implemented */
        movl $0x80000000, %eax
        cpuid
-       cmpl $0x80000000, %eax
-       jbe 6f
+       /* Value must be in the range 0x80000001 to 0x8000ffff */
+       subl $0x80000001, %eax
+       cmpl $(0x8000ffff-0x80000001), %eax
+       ja 6f
        mov $0x80000001, %eax
        cpuid
        /* Execute Disable bit supported? */
-       btl $20, %edx
+       btl $(X86_FEATURE_NX & 31), %edx
        jnc 6f
  
        /* Setup EFER (Extended Feature Enable Register) */
-       movl $0xc0000080, %ecx
+       movl $MSR_EFER, %ecx
        rdmsr
  
-       btsl $11, %eax
+       btsl $_EFER_NX, %eax
        /* Make changes effective */
        wrmsr
  
@@@ -438,8 -442,8 +442,8 @@@ is386:     movl $2,%ecx            # set M
         */
        cmpb $0,ready
        jne 1f
 -      movl $per_cpu__gdt_page,%eax
 -      movl $per_cpu__stack_canary,%ecx
 +      movl $gdt_page,%eax
 +      movl $stack_canary,%ecx
        movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
        shrl $16, %ecx
        movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@@ -702,7 -706,7 +706,7 @@@ idt_descr
        .word 0                         # 32 bit align gdt_desc.address
  ENTRY(early_gdt_descr)
        .word GDT_ENTRIES*8-1
 -      .long per_cpu__gdt_page         /* Overwritten for secondary CPUs */
 +      .long gdt_page                  /* Overwritten for secondary CPUs */
  
  /*
   * The boot_gdt must mirror the equivalent in setup.S and is
diff --combined arch/x86/kernel/vmlinux.lds.S
index ecb92717c41264aab23e184463d63fb75db539ae,f92a0da608cb3ade16374118320e4d73220e4b95..44879df55696407556d711b69b0f83fdc311f7f0
@@@ -41,6 -41,32 +41,32 @@@ ENTRY(phys_startup_64
  jiffies_64 = jiffies;
  #endif
  
+ #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+ /*
+  * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
+  * we retain large page mappings for boundaries spanning kernel text, rodata
+  * and data sections.
+  *
+  * However, kernel identity mappings will have different RWX permissions
+  * to the pages mapping to text and to the pages padding (which are freed) the
+  * text section. Hence kernel identity mappings will be broken to smaller
+  * pages. For 64-bit, kernel text and kernel identity mappings are different,
+  * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
+  * as well as retain 2MB large page mappings for kernel text.
+  */
+ #define X64_ALIGN_DEBUG_RODATA_BEGIN  . = ALIGN(HPAGE_SIZE);
+ #define X64_ALIGN_DEBUG_RODATA_END                            \
+               . = ALIGN(HPAGE_SIZE);                          \
+               __end_rodata_hpage_align = .;
+ #else
+ #define X64_ALIGN_DEBUG_RODATA_BEGIN
+ #define X64_ALIGN_DEBUG_RODATA_END
+ #endif
  PHDRS {
        text PT_LOAD FLAGS(5);          /* R_E */
        data PT_LOAD FLAGS(7);          /* RWE */
@@@ -90,7 -116,9 +116,9 @@@ SECTION
  
        EXCEPTION_TABLE(16) :text = 0x9090
  
+       X64_ALIGN_DEBUG_RODATA_BEGIN
        RO_DATA(PAGE_SIZE)
+       X64_ALIGN_DEBUG_RODATA_END
  
        /* Data */
        .data : AT(ADDR(.data) - LOAD_OFFSET) {
  
                PAGE_ALIGNED_DATA(PAGE_SIZE)
  
-               CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
+               CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
  
                DATA_DATA
                CONSTRUCTORS
  
                /* rarely changed data like cpu maps */
-               READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
+               READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
  
                /* End of data section */
                _edata = .;
                *(.vsyscall_0)
        } :user
  
-       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       . = ALIGN(L1_CACHE_BYTES);
        .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
                *(.vsyscall_fn)
        }
  
-       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       . = ALIGN(L1_CACHE_BYTES);
        .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
                *(.vsyscall_gtod_data)
        }
        }
        vgetcpu_mode = VVIRT(.vgetcpu_mode);
  
-       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       . = ALIGN(L1_CACHE_BYTES);
        .jiffies : AT(VLOAD(.jiffies)) {
                *(.jiffies)
        }
                __brk_limit = .;
        }
  
-       .end : AT(ADDR(.end) - LOAD_OFFSET) {
-               _end = .;
-       }
+       _end = .;
  
          STABS_DEBUG
          DWARF_DEBUG
  
  
  #ifdef CONFIG_X86_32
+ /*
+  * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
+  */
  . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
           "kernel image bigger than KERNEL_IMAGE_SIZE");
  #else
   * Per-cpu symbols which need to be offset from __per_cpu_load
   * for the boot processor.
   */
 -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
 +#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
  INIT_PER_CPU(gdt_page);
  INIT_PER_CPU(irq_stack_union);
  
           "kernel image bigger than KERNEL_IMAGE_SIZE");
  
  #ifdef CONFIG_SMP
 -. = ASSERT((per_cpu__irq_stack_union == 0),
 +. = ASSERT((irq_stack_union == 0),
             "irq_stack_union is not at start of per-cpu area");
  #endif
  
diff --combined include/linux/compiler.h
index abba8045c6ef19b69e0dea7f625b52d2a816d94e,5be3dab4a69547bf6bb378a45d73e553166c5b12..a5a472b10746c662450059d8af969d58c7dcac6c
@@@ -5,7 -5,7 +5,7 @@@
  
  #ifdef __CHECKER__
  # define __user               __attribute__((noderef, address_space(1)))
 -# define __kernel     /* default address space */
 +# define __kernel     __attribute__((address_space(0)))
  # define __safe               __attribute__((safe))
  # define __force      __attribute__((force))
  # define __nocast     __attribute__((nocast))
@@@ -15,7 -15,6 +15,7 @@@
  # define __acquire(x) __context__(x,1)
  # define __release(x) __context__(x,-1)
  # define __cond_lock(x,c)     ((c) ? ({ __acquire(x); 1; }) : 0)
 +# define __percpu     __attribute__((noderef, address_space(3)))
  extern void __chk_user_ptr(const volatile void __user *);
  extern void __chk_io_ptr(const volatile void __iomem *);
  #else
@@@ -33,7 -32,6 +33,7 @@@
  # define __acquire(x) (void)0
  # define __release(x) (void)0
  # define __cond_lock(x,c) (c)
 +# define __percpu
  #endif
  
  #ifdef __KERNEL__
@@@ -146,6 -144,11 +146,11 @@@ void ftrace_likely_update(struct ftrace
  # define barrier() __memory_barrier()
  #endif
  
+ /* Unreachable code */
+ #ifndef unreachable
+ # define unreachable() do { } while (1)
+ #endif
  #ifndef RELOC_HIDE
  # define RELOC_HIDE(ptr, off)                                 \
    ({ unsigned long __ptr;                                     \
  # define __maybe_unused               /* unimplemented */
  #endif
  
+ #ifndef __always_unused
+ # define __always_unused      /* unimplemented */
+ #endif
  #ifndef noinline
  #define noinline
  #endif
  # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
  #endif
  
+ /* Compile time object size, -1 for unknown */
+ #ifndef __compiletime_object_size
+ # define __compiletime_object_size(obj) -1
+ #endif
+ #ifndef __compiletime_warning
+ # define __compiletime_warning(message)
+ #endif
+ #ifndef __compiletime_error
+ # define __compiletime_error(message)
+ #endif
  /*
   * Prevent the compiler from merging or refetching accesses.  The compiler
   * is also forbidden from reordering successive instances of ACCESS_ONCE(),
diff --combined include/linux/percpu.h
index 42878f0cd0e2212ee79f853832431b027d7c3062,cf5efbcf716c8cecf74d4d315e2619f6fdcfa1f4..a93e5bfdccb8e8b825006776f4afb77ebc975e90
   * we force a syntax error here if it isn't.
   */
  #define get_cpu_var(var) (*({                         \
 -      extern int simple_identifier_##var(void);       \
        preempt_disable();                              \
        &__get_cpu_var(var); }))
 -#define put_cpu_var(var) preempt_enable()
 +
 +/*
 + * The weird & is necessary because sparse considers (void)(var) to be
 + * a direct dereference of percpu variable (var).
 + */
 +#define put_cpu_var(var) do {                         \
 +      (void)&(var);                                   \
 +      preempt_enable();                               \
 +} while (0)
  
  #ifdef CONFIG_SMP
  
@@@ -134,9 -127,10 +134,10 @@@ extern int __init pcpu_page_first_chunk
   */
  #define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
  
 -extern void *__alloc_reserved_percpu(size_t size, size_t align);
 -extern void *__alloc_percpu(size_t size, size_t align);
 -extern void free_percpu(void *__pdata);
 +extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align);
 +extern void __percpu *__alloc_percpu(size_t size, size_t align);
 +extern void free_percpu(void __percpu *__pdata);
+ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
  
  #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
  extern void __init setup_per_cpu_areas(void);
  
  #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
  
 -static inline void *__alloc_percpu(size_t size, size_t align)
 +static inline void __percpu *__alloc_percpu(size_t size, size_t align)
  {
        /*
         * Can't easily make larger alignment work with kmalloc.  WARN
        return kzalloc(size, GFP_KERNEL);
  }
  
 -static inline void free_percpu(void *p)
 +static inline void free_percpu(void __percpu *p)
  {
        kfree(p);
  }
  
+ static inline phys_addr_t per_cpu_ptr_to_phys(void *addr)
+ {
+       return __pa(addr);
+ }
  static inline void __init setup_per_cpu_areas(void) { }
  
  static inline void *pcpu_lpage_remapped(void *kaddr)
  #endif /* CONFIG_SMP */
  
  #define alloc_percpu(type)    \
 -      (typeof(type) *)__alloc_percpu(sizeof(type), __alignof__(type))
 +      (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type))
  
  /*
   * Optional methods for optimized non-lvalue per-cpu variable access.
  #ifndef percpu_read
  # define percpu_read(var)                                             \
    ({                                                                  \
 -      typeof(per_cpu_var(var)) __tmp_var__;                           \
 -      __tmp_var__ = get_cpu_var(var);                                 \
 -      put_cpu_var(var);                                               \
 -      __tmp_var__;                                                    \
 +      typeof(var) *pr_ptr__ = &(var);                                 \
 +      typeof(var) pr_ret__;                                           \
 +      pr_ret__ = get_cpu_var(*pr_ptr__);                              \
 +      put_cpu_var(*pr_ptr__);                                         \
 +      pr_ret__;                                                       \
    })
  #endif
  
  #define __percpu_generic_to_op(var, val, op)                          \
  do {                                                                  \
 -      get_cpu_var(var) op val;                                        \
 -      put_cpu_var(var);                                               \
 +      typeof(var) *pgto_ptr__ = &(var);                               \
 +      get_cpu_var(*pgto_ptr__) op val;                                \
 +      put_cpu_var(*pgto_ptr__);                                       \
  } while (0)
  
  #ifndef percpu_write
@@@ -237,7 -234,6 +243,7 @@@ extern void __bad_size_call_parameter(v
  
  #define __pcpu_size_call_return(stem, variable)                               \
  ({    typeof(variable) pscr_ret__;                                    \
 +      __verify_pcpu_ptr(&(variable));                                 \
        switch(sizeof(variable)) {                                      \
        case 1: pscr_ret__ = stem##1(variable);break;                   \
        case 2: pscr_ret__ = stem##2(variable);break;                   \
  
  #define __pcpu_size_call(stem, variable, ...)                         \
  do {                                                                  \
 +      __verify_pcpu_ptr(&(variable));                                 \
        switch(sizeof(variable)) {                                      \
                case 1: stem##1(variable, __VA_ARGS__);break;           \
                case 2: stem##2(variable, __VA_ARGS__);break;           \
  
  /*
   * Optimized manipulation for memory allocated through the per cpu
 - * allocator or for addresses of per cpu variables (can be determined
 - * using per_cpu_var(xx).
 + * allocator or for addresses of per cpu variables.
   *
   * These operation guarantee exclusivity of access for other operations
   * on the *same* processor. The assumption is that per cpu data is only
  #define _this_cpu_generic_to_op(pcp, val, op)                         \
  do {                                                                  \
        preempt_disable();                                              \
 -      *__this_cpu_ptr(&pcp) op val;                                   \
 +      *__this_cpu_ptr(&(pcp)) op val;                                 \
        preempt_enable();                                               \
  } while (0)
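
The percpu.h changes above add the sparse __percpu annotation to the dynamic allocator and make the accessors take plain variables rather than per_cpu_var() wrappers. A hedged usage sketch against that interface; the hit_counter type and helper names are invented for illustration:

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

struct hit_counter {
	unsigned long hits;
};

/* Dynamically allocated per-cpu data now carries __percpu for sparse. */
static struct hit_counter __percpu *counters;

static int counters_init(void)
{
	counters = alloc_percpu(struct hit_counter);
	return counters ? 0 : -ENOMEM;
}

static void counters_hit(void)
{
	/* Increment this CPU's copy; the this_cpu op handles preemption. */
	this_cpu_inc(counters->hits);
}

static unsigned long counters_total(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(counters, cpu)->hits;
	return sum;
}

static void counters_exit(void)
{
	free_percpu(counters);
}
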
  
diff --combined include/linux/vmstat.h
index 3e489fda11a1d4a5686efdd50a0f74018602a027,ee03bba9c5df8e9d0b0586fcfff5ef39e254c717..117f0dd8ad03fa3780b86b8feedbdbb1603c1576
@@@ -40,6 -40,8 +40,8 @@@ enum vm_event_item { PGPGIN, PGPGOUT, P
                PGSCAN_ZONE_RECLAIM_FAILED,
  #endif
                PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL,
+               KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
+               KSWAPD_SKIP_CONGESTION_WAIT,
                PAGEOUTRUN, ALLOCSTALL, PGROTATED,
  #ifdef CONFIG_HUGETLB_PAGE
                HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
@@@ -76,22 -78,22 +78,22 @@@ DECLARE_PER_CPU(struct vm_event_state, 
  
  static inline void __count_vm_event(enum vm_event_item item)
  {
 -      __this_cpu_inc(per_cpu_var(vm_event_states).event[item]);
 +      __this_cpu_inc(vm_event_states.event[item]);
  }
  
  static inline void count_vm_event(enum vm_event_item item)
  {
 -      this_cpu_inc(per_cpu_var(vm_event_states).event[item]);
 +      this_cpu_inc(vm_event_states.event[item]);
  }
  
  static inline void __count_vm_events(enum vm_event_item item, long delta)
  {
 -      __this_cpu_add(per_cpu_var(vm_event_states).event[item], delta);
 +      __this_cpu_add(vm_event_states.event[item], delta);
  }
  
  static inline void count_vm_events(enum vm_event_item item, long delta)
  {
 -      this_cpu_add(per_cpu_var(vm_event_states).event[item], delta);
 +      this_cpu_add(vm_event_states.event[item], delta);
  }
  
  extern void all_vm_events(unsigned long *);
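
The vmstat.h helpers show the same conversion for statically defined per-cpu variables: this_cpu_inc()/__this_cpu_add() now take the variable by name, with no per_cpu__ prefix or per_cpu_var() wrapper. A small sketch of that style, using a made-up event structure rather than the real vm_event_states:

#include <linux/percpu.h>

enum my_event_item { MY_EV_ALLOC, MY_EV_FREE, MY_NR_EVENTS };

struct my_event_state {
	unsigned long event[MY_NR_EVENTS];
};

static DEFINE_PER_CPU(struct my_event_state, my_event_states);

/* Preemption-safe variant. */
static inline void count_my_event(enum my_event_item item)
{
	this_cpu_inc(my_event_states.event[item]);
}

/* Caller guarantees preemption is already disabled. */
static inline void __count_my_events(enum my_event_item item, long delta)
{
	__this_cpu_add(my_event_states.event[item], delta);
}
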
diff --combined kernel/rcutorture.c
index e339ab34912144c36ba8a0efd4e48b45bebd91d4,9bb52177af02a3e20aa347e3b65c0a236caa1922..0b5217535f71b57e905fab1caaf26923a241593d
@@@ -327,6 -327,11 +327,11 @@@ rcu_torture_cb(struct rcu_head *p
                cur_ops->deferred_free(rp);
  }
  
+ static int rcu_no_completed(void)
+ {
+       return 0;
+ }
  static void rcu_torture_deferred_free(struct rcu_torture *p)
  {
        call_rcu(&p->rtort_rcu, rcu_torture_cb);
@@@ -388,6 -393,21 +393,21 @@@ static struct rcu_torture_ops rcu_sync_
        .name           = "rcu_sync"
  };
  
+ static struct rcu_torture_ops rcu_expedited_ops = {
+       .init           = rcu_sync_torture_init,
+       .cleanup        = NULL,
+       .readlock       = rcu_torture_read_lock,
+       .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
+       .readunlock     = rcu_torture_read_unlock,
+       .completed      = rcu_no_completed,
+       .deferred_free  = rcu_sync_torture_deferred_free,
+       .sync           = synchronize_rcu_expedited,
+       .cb_barrier     = NULL,
+       .stats          = NULL,
+       .irq_capable    = 1,
+       .name           = "rcu_expedited"
+ };
  /*
   * Definitions for rcu_bh torture testing.
   */
@@@ -547,6 -567,25 +567,25 @@@ static struct rcu_torture_ops srcu_ops 
        .name           = "srcu"
  };
  
+ static void srcu_torture_synchronize_expedited(void)
+ {
+       synchronize_srcu_expedited(&srcu_ctl);
+ }
+ static struct rcu_torture_ops srcu_expedited_ops = {
+       .init           = srcu_torture_init,
+       .cleanup        = srcu_torture_cleanup,
+       .readlock       = srcu_torture_read_lock,
+       .read_delay     = srcu_read_delay,
+       .readunlock     = srcu_torture_read_unlock,
+       .completed      = srcu_torture_completed,
+       .deferred_free  = rcu_sync_torture_deferred_free,
+       .sync           = srcu_torture_synchronize_expedited,
+       .cb_barrier     = NULL,
+       .stats          = srcu_torture_stats,
+       .name           = "srcu_expedited"
+ };
  /*
   * Definitions for sched torture testing.
   */
@@@ -562,11 -601,6 +601,6 @@@ static void sched_torture_read_unlock(i
        preempt_enable();
  }
  
- static int sched_torture_completed(void)
- {
-       return 0;
- }
  static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
  {
        call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
@@@ -583,7 -617,7 +617,7 @@@ static struct rcu_torture_ops sched_op
        .readlock       = sched_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = sched_torture_read_unlock,
-       .completed      = sched_torture_completed,
+       .completed      = rcu_no_completed,
        .deferred_free  = rcu_sched_torture_deferred_free,
        .sync           = sched_torture_synchronize,
        .cb_barrier     = rcu_barrier_sched,
        .name           = "sched"
  };
  
- static struct rcu_torture_ops sched_ops_sync = {
+ static struct rcu_torture_ops sched_sync_ops = {
        .init           = rcu_sync_torture_init,
        .cleanup        = NULL,
        .readlock       = sched_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = sched_torture_read_unlock,
-       .completed      = sched_torture_completed,
+       .completed      = rcu_no_completed,
        .deferred_free  = rcu_sync_torture_deferred_free,
        .sync           = sched_torture_synchronize,
        .cb_barrier     = NULL,
        .name           = "sched_sync"
  };
  
- extern int rcu_expedited_torture_stats(char *page);
  static struct rcu_torture_ops sched_expedited_ops = {
        .init           = rcu_sync_torture_init,
        .cleanup        = NULL,
        .readlock       = sched_torture_read_lock,
        .read_delay     = rcu_read_delay,  /* just reuse rcu's version. */
        .readunlock     = sched_torture_read_unlock,
-       .completed      = sched_torture_completed,
+       .completed      = rcu_no_completed,
        .deferred_free  = rcu_sync_torture_deferred_free,
        .sync           = synchronize_sched_expedited,
        .cb_barrier     = NULL,
@@@ -650,7 -682,7 +682,7 @@@ rcu_torture_writer(void *arg
                old_rp = rcu_torture_current;
                rp->rtort_mbtest = 1;
                rcu_assign_pointer(rcu_torture_current, rp);
-               smp_wmb();
+               smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
                if (old_rp) {
                        i = old_rp->rtort_pipe_count;
                        if (i > RCU_TORTURE_PIPE_LEN)
@@@ -731,13 -763,13 +763,13 @@@ static void rcu_torture_timer(unsigned 
                /* Should not happen, but... */
                pipe_count = RCU_TORTURE_PIPE_LEN;
        }
 -      __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
 +      __this_cpu_inc(rcu_torture_count[pipe_count]);
        completed = cur_ops->completed() - completed;
        if (completed > RCU_TORTURE_PIPE_LEN) {
                /* Should not happen, but... */
                completed = RCU_TORTURE_PIPE_LEN;
        }
 -      __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
 +      __this_cpu_inc(rcu_torture_batch[completed]);
        preempt_enable();
        cur_ops->readunlock(idx);
  }
@@@ -786,13 -818,13 +818,13 @@@ rcu_torture_reader(void *arg
                        /* Should not happen, but... */
                        pipe_count = RCU_TORTURE_PIPE_LEN;
                }
 -              __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
 +              __this_cpu_inc(rcu_torture_count[pipe_count]);
                completed = cur_ops->completed() - completed;
                if (completed > RCU_TORTURE_PIPE_LEN) {
                        /* Should not happen, but... */
                        completed = RCU_TORTURE_PIPE_LEN;
                }
 -              __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
 +              __this_cpu_inc(rcu_torture_batch[completed]);
                preempt_enable();
                cur_ops->readunlock(idx);
                schedule();
@@@ -1099,9 -1131,10 +1131,10 @@@ rcu_torture_init(void
        int cpu;
        int firsterr = 0;
        static struct rcu_torture_ops *torture_ops[] =
-               { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
-                 &sched_expedited_ops,
-                 &srcu_ops, &sched_ops, &sched_ops_sync, };
+               { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
+                 &rcu_bh_ops, &rcu_bh_sync_ops,
+                 &srcu_ops, &srcu_expedited_ops,
+                 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
  
        mutex_lock(&fullstop_mutex);
  
                        break;
        }
        if (i == ARRAY_SIZE(torture_ops)) {
-               printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
+               printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
                       torture_type);
+               printk(KERN_ALERT "rcu-torture types:");
+               for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+                       printk(KERN_ALERT " %s", torture_ops[i]->name);
+               printk(KERN_ALERT "\n");
                mutex_unlock(&fullstop_mutex);
                return -EINVAL;
        }
diff --combined kernel/trace/trace.c
index b808177af8168299b28fec9fe8d70bd9767936b0,0df1b0f2cb9e0717f2a21f6923389c1978a1fa04..ab2bbb0e942958e70812b43a24014255ea00b163
@@@ -12,7 -12,7 +12,7 @@@
   *  Copyright (C) 2004 William Lee Irwin III
   */
  #include <linux/ring_buffer.h>
- #include <linux/utsrelease.h>
+ #include <generated/utsrelease.h>
  #include <linux/stacktrace.h>
  #include <linux/writeback.h>
  #include <linux/kallsyms.h>
@@@ -91,12 -91,12 +91,12 @@@ DEFINE_PER_CPU(int, ftrace_cpu_disabled
  static inline void ftrace_disable_cpu(void)
  {
        preempt_disable();
 -      __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
 +      __this_cpu_inc(ftrace_cpu_disabled);
  }
  
  static inline void ftrace_enable_cpu(void)
  {
 -      __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
 +      __this_cpu_dec(ftrace_cpu_disabled);
        preempt_enable();
  }
  
@@@ -129,7 -129,7 +129,7 @@@ static int tracing_set_tracer(const cha
  static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
  static char *default_bootup_tracer;
  
- static int __init set_ftrace(char *str)
+ static int __init set_cmdline_ftrace(char *str)
  {
        strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
        default_bootup_tracer = bootup_tracer_buf;
        ring_buffer_expanded = 1;
        return 1;
  }
- __setup("ftrace=", set_ftrace);
+ __setup("ftrace=", set_cmdline_ftrace);
  
  static int __init set_ftrace_dump_on_oops(char *str)
  {
@@@ -313,7 -313,6 +313,6 @@@ static const char *trace_options[] = 
        "bin",
        "block",
        "stacktrace",
-       "sched-tree",
        "trace_printk",
        "ftrace_preempt",
        "branch",
@@@ -493,15 -492,15 +492,15 @@@ static ssize_t trace_seq_to_buffer(stru
   * protected by per_cpu spinlocks. But the action of the swap
   * needs its own lock.
   *
-  * This is defined as a raw_spinlock_t in order to help
+  * This is defined as a arch_spinlock_t in order to help
   * with performance when lockdep debugging is enabled.
   *
   * It is also used in other places outside the update_max_tr
   * so it needs to be defined outside of the
   * CONFIG_TRACER_MAX_TRACE.
   */
- static raw_spinlock_t ftrace_max_lock =
-       (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ static arch_spinlock_t ftrace_max_lock =
+       (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
  
  #ifdef CONFIG_TRACER_MAX_TRACE
  unsigned long __read_mostly   tracing_max_latency;
@@@ -555,13 -554,13 +554,13 @@@ update_max_tr(struct trace_array *tr, s
                return;
  
        WARN_ON_ONCE(!irqs_disabled());
-       __raw_spin_lock(&ftrace_max_lock);
+       arch_spin_lock(&ftrace_max_lock);
  
        tr->buffer = max_tr.buffer;
        max_tr.buffer = buf;
  
        __update_max_tr(tr, tsk, cpu);
-       __raw_spin_unlock(&ftrace_max_lock);
+       arch_spin_unlock(&ftrace_max_lock);
  }
  
  /**
@@@ -581,7 -580,7 +580,7 @@@ update_max_tr_single(struct trace_arra
                return;
  
        WARN_ON_ONCE(!irqs_disabled());
-       __raw_spin_lock(&ftrace_max_lock);
+       arch_spin_lock(&ftrace_max_lock);
  
        ftrace_disable_cpu();
  
        WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
  
        __update_max_tr(tr, tsk, cpu);
-       __raw_spin_unlock(&ftrace_max_lock);
+       arch_spin_unlock(&ftrace_max_lock);
  }
  #endif /* CONFIG_TRACER_MAX_TRACE */
  
@@@ -802,7 -801,7 +801,7 @@@ static unsigned map_pid_to_cmdline[PID_
  static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
  static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
  static int cmdline_idx;
- static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
+ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
  
  /* temporary disable recording */
  static atomic_t trace_record_cmdline_disabled __read_mostly;
@@@ -915,7 -914,7 +914,7 @@@ static void trace_save_cmdline(struct t
         * nor do we want to disable interrupts,
         * so if we miss here, then better luck next time.
         */
-       if (!__raw_spin_trylock(&trace_cmdline_lock))
+       if (!arch_spin_trylock(&trace_cmdline_lock))
                return;
  
        idx = map_pid_to_cmdline[tsk->pid];
  
        memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
  
-       __raw_spin_unlock(&trace_cmdline_lock);
+       arch_spin_unlock(&trace_cmdline_lock);
  }
  
  void trace_find_cmdline(int pid, char comm[])
        }
  
        preempt_disable();
-       __raw_spin_lock(&trace_cmdline_lock);
+       arch_spin_lock(&trace_cmdline_lock);
        map = map_pid_to_cmdline[pid];
        if (map != NO_CMDLINE_MAP)
                strcpy(comm, saved_cmdlines[map]);
        else
                strcpy(comm, "<...>");
  
-       __raw_spin_unlock(&trace_cmdline_lock);
+       arch_spin_unlock(&trace_cmdline_lock);
        preempt_enable();
  }
  
@@@ -1085,7 -1084,7 +1084,7 @@@ trace_function(struct trace_array *tr
        struct ftrace_entry *entry;
  
        /* If we are reading the ring buffer, don't trace */
 -      if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
 +      if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                return;
  
        event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@@ -1151,6 -1150,22 +1150,22 @@@ void __trace_stack(struct trace_array *
        __ftrace_trace_stack(tr->buffer, flags, skip, pc);
  }
  
+ /**
+  * trace_dump_stack - record a stack back trace in the trace buffer
+  */
+ void trace_dump_stack(void)
+ {
+       unsigned long flags;
+       if (tracing_disabled || tracing_selftest_running)
+               return;
+       local_save_flags(flags);
+       /* skipping 3 traces, seems to get us at the caller of this function */
+       __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
+ }
  void
  ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
  {
@@@ -1251,8 -1266,8 +1266,8 @@@ ftrace_special(unsigned long arg1, unsi
   */
  int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
  {
-       static raw_spinlock_t trace_buf_lock =
-               (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+       static arch_spinlock_t trace_buf_lock =
+               (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
        static u32 trace_buf[TRACE_BUF_SIZE];
  
        struct ftrace_event_call *call = &event_bprint;
  
        /* Lockdep uses trace_printk for lock tracing */
        local_irq_save(flags);
-       __raw_spin_lock(&trace_buf_lock);
+       arch_spin_lock(&trace_buf_lock);
        len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
  
        if (len > TRACE_BUF_SIZE || len < 0)
                ring_buffer_unlock_commit(buffer, event);
  
  out_unlock:
-       __raw_spin_unlock(&trace_buf_lock);
+       arch_spin_unlock(&trace_buf_lock);
        local_irq_restore(flags);
  
  out:
@@@ -1334,7 -1349,7 +1349,7 @@@ int trace_array_printk(struct trace_arr
  int trace_array_vprintk(struct trace_array *tr,
                        unsigned long ip, const char *fmt, va_list args)
  {
-       static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
+       static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
        static char trace_buf[TRACE_BUF_SIZE];
  
        struct ftrace_event_call *call = &event_print;
  
        pause_graph_tracing();
        raw_local_irq_save(irq_flags);
-       __raw_spin_lock(&trace_buf_lock);
+       arch_spin_lock(&trace_buf_lock);
        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
  
-       len = min(len, TRACE_BUF_SIZE-1);
-       trace_buf[len] = 0;
        size = sizeof(*entry) + len + 1;
        buffer = tr->buffer;
        event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
        if (!event)
                goto out_unlock;
        entry = ring_buffer_event_data(event);
-       entry->ip                       = ip;
+       entry->ip = ip;
  
        memcpy(&entry->buf, trace_buf, len);
-       entry->buf[len] = 0;
+       entry->buf[len] = '\0';
        if (!filter_check_discard(call, entry, buffer, event))
                ring_buffer_unlock_commit(buffer, event);
  
   out_unlock:
-       __raw_spin_unlock(&trace_buf_lock);
+       arch_spin_unlock(&trace_buf_lock);
        raw_local_irq_restore(irq_flags);
        unpause_graph_tracing();
   out:
  
  int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
  {
-       return trace_array_printk(&global_trace, ip, fmt, args);
+       return trace_array_vprintk(&global_trace, ip, fmt, args);
  }
  EXPORT_SYMBOL_GPL(trace_vprintk);
  
@@@ -1515,6 -1527,8 +1527,8 @@@ static void *s_next(struct seq_file *m
        int i = (int)*pos;
        void *ent;
  
+       WARN_ON_ONCE(iter->leftover);
        (*pos)++;
  
        /* can't go backwards */
@@@ -1613,8 -1627,16 +1627,16 @@@ static void *s_start(struct seq_file *m
                        ;
  
        } else {
-               l = *pos - 1;
-               p = s_next(m, p, &l);
+               /*
+                * If we overflowed the seq_file before, then we want
+                * to just reuse the trace_seq buffer again.
+                */
+               if (iter->leftover)
+                       p = iter;
+               else {
+                       l = *pos - 1;
+                       p = s_next(m, p, &l);
+               }
        }
  
        trace_event_read_lock();
@@@ -1922,6 -1944,7 +1944,7 @@@ static enum print_line_t print_trace_li
  static int s_show(struct seq_file *m, void *v)
  {
        struct trace_iterator *iter = v;
+       int ret;
  
        if (iter->ent == NULL) {
                if (iter->tr) {
                        if (!(trace_flags & TRACE_ITER_VERBOSE))
                                print_func_help_header(m);
                }
+       } else if (iter->leftover) {
+               /*
+                * If we filled the seq_file buffer earlier, we
+                * want to just show it now.
+                */
+               ret = trace_print_seq(m, &iter->seq);
+               /* ret should this time be zero, but you never know */
+               iter->leftover = ret;
        } else {
                print_trace_line(iter);
-               trace_print_seq(m, &iter->seq);
+               ret = trace_print_seq(m, &iter->seq);
+               /*
+                * If we overflow the seq_file buffer, then it will
+                * ask us for this data again at start up.
+                * Use that instead.
+                *  ret is 0 if seq_file write succeeded.
+                *        -1 otherwise.
+                */
+               iter->leftover = ret;
        }
  
        return 0;
@@@ -2253,7 -2294,7 +2294,7 @@@ tracing_cpumask_write(struct file *filp
        mutex_lock(&tracing_cpumask_update_lock);
  
        local_irq_disable();
-       __raw_spin_lock(&ftrace_max_lock);
+       arch_spin_lock(&ftrace_max_lock);
        for_each_tracing_cpu(cpu) {
                /*
                 * Increase/decrease the disabled counter if we are
                        atomic_dec(&global_trace.data[cpu]->disabled);
                }
        }
-       __raw_spin_unlock(&ftrace_max_lock);
+       arch_spin_unlock(&ftrace_max_lock);
        local_irq_enable();
  
        cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@@ -2290,67 -2331,49 +2331,49 @@@ static const struct file_operations tra
        .write          = tracing_cpumask_write,
  };
  
- static ssize_t
- tracing_trace_options_read(struct file *filp, char __user *ubuf,
-                      size_t cnt, loff_t *ppos)
+ static int tracing_trace_options_show(struct seq_file *m, void *v)
  {
        struct tracer_opt *trace_opts;
        u32 tracer_flags;
-       int len = 0;
-       char *buf;
-       int r = 0;
        int i;
  
-       /* calculate max size */
-       for (i = 0; trace_options[i]; i++) {
-               len += strlen(trace_options[i]);
-               len += 3; /* "no" and newline */
-       }
        mutex_lock(&trace_types_lock);
        tracer_flags = current_trace->flags->val;
        trace_opts = current_trace->flags->opts;
  
-       /*
-        * Increase the size with names of options specific
-        * of the current tracer.
-        */
-       for (i = 0; trace_opts[i].name; i++) {
-               len += strlen(trace_opts[i].name);
-               len += 3; /* "no" and newline */
-       }
-       /* +1 for \0 */
-       buf = kmalloc(len + 1, GFP_KERNEL);
-       if (!buf) {
-               mutex_unlock(&trace_types_lock);
-               return -ENOMEM;
-       }
        for (i = 0; trace_options[i]; i++) {
                if (trace_flags & (1 << i))
-                       r += sprintf(buf + r, "%s\n", trace_options[i]);
+                       seq_printf(m, "%s\n", trace_options[i]);
                else
-                       r += sprintf(buf + r, "no%s\n", trace_options[i]);
+                       seq_printf(m, "no%s\n", trace_options[i]);
        }
  
        for (i = 0; trace_opts[i].name; i++) {
                if (tracer_flags & trace_opts[i].bit)
-                       r += sprintf(buf + r, "%s\n",
-                               trace_opts[i].name);
+                       seq_printf(m, "%s\n", trace_opts[i].name);
                else
-                       r += sprintf(buf + r, "no%s\n",
-                               trace_opts[i].name);
+                       seq_printf(m, "no%s\n", trace_opts[i].name);
        }
        mutex_unlock(&trace_types_lock);
  
-       WARN_ON(r >= len + 1);
+       return 0;
+ }
  
-       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+ static int __set_tracer_option(struct tracer *trace,
+                              struct tracer_flags *tracer_flags,
+                              struct tracer_opt *opts, int neg)
+ {
+       int ret;
  
-       kfree(buf);
-       return r;
+       ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
+       if (ret)
+               return ret;
+       if (neg)
+               tracer_flags->val &= ~opts->bit;
+       else
+               tracer_flags->val |= opts->bit;
+       return 0;
  }
  
  /* Try to assign a tracer specific option */
@@@ -2358,33 -2381,17 +2381,17 @@@ static int set_tracer_option(struct tra
  {
        struct tracer_flags *tracer_flags = trace->flags;
        struct tracer_opt *opts = NULL;
-       int ret = 0, i = 0;
-       int len;
+       int i;
  
        for (i = 0; tracer_flags->opts[i].name; i++) {
                opts = &tracer_flags->opts[i];
-               len = strlen(opts->name);
  
-               if (strncmp(cmp, opts->name, len) == 0) {
-                       ret = trace->set_flag(tracer_flags->val,
-                               opts->bit, !neg);
-                       break;
-               }
+               if (strcmp(cmp, opts->name) == 0)
+                       return __set_tracer_option(trace, trace->flags,
+                                                  opts, neg);
        }
-       /* Not found */
-       if (!tracer_flags->opts[i].name)
-               return -EINVAL;
-       /* Refused to handle */
-       if (ret)
-               return ret;
-       if (neg)
-               tracer_flags->val &= ~opts->bit;
-       else
-               tracer_flags->val |= opts->bit;
  
-       return 0;
+       return -EINVAL;
  }
  
  static void set_tracer_flags(unsigned int mask, int enabled)
@@@ -2404,7 -2411,7 +2411,7 @@@ tracing_trace_options_write(struct fil
                        size_t cnt, loff_t *ppos)
  {
        char buf[64];
-       char *cmp = buf;
+       char *cmp;
        int neg = 0;
        int ret;
        int i;
                return -EFAULT;
  
        buf[cnt] = 0;
+       cmp = strstrip(buf);
  
-       if (strncmp(buf, "no", 2) == 0) {
+       if (strncmp(cmp, "no", 2) == 0) {
                neg = 1;
                cmp += 2;
        }
  
        for (i = 0; trace_options[i]; i++) {
-               int len = strlen(trace_options[i]);
-               if (strncmp(cmp, trace_options[i], len) == 0) {
+               if (strcmp(cmp, trace_options[i]) == 0) {
                        set_tracer_flags(1 << i, !neg);
                        break;
                }
                        return ret;
        }
  
-       filp->f_pos += cnt;
+       *ppos += cnt;
  
        return cnt;
  }
  
+ static int tracing_trace_options_open(struct inode *inode, struct file *file)
+ {
+       if (tracing_disabled)
+               return -ENODEV;
+       return single_open(file, tracing_trace_options_show, NULL);
+ }
  static const struct file_operations tracing_iter_fops = {
-       .open           = tracing_open_generic,
-       .read           = tracing_trace_options_read,
+       .open           = tracing_trace_options_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
        .write          = tracing_trace_options_write,
  };
  
@@@ -2582,7 -2597,7 +2597,7 @@@ tracing_ctrl_write(struct file *filp, c
        }
        mutex_unlock(&trace_types_lock);
  
-       filp->f_pos += cnt;
+       *ppos += cnt;
  
        return cnt;
  }
@@@ -2764,7 -2779,7 +2779,7 @@@ tracing_set_trace_write(struct file *fi
        if (err)
                return err;
  
-       filp->f_pos += ret;
+       *ppos += ret;
  
        return ret;
  }
@@@ -2897,6 -2912,10 +2912,10 @@@ static int tracing_release_pipe(struct 
        else
                cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
  
+       if (iter->trace->pipe_close)
+               iter->trace->pipe_close(iter);
        mutex_unlock(&trace_types_lock);
  
        free_cpumask_var(iter->started);
@@@ -3103,7 -3122,7 +3122,7 @@@ static void tracing_spd_release_pipe(st
        __free_page(spd->pages[idx]);
  }
  
- static struct pipe_buf_operations tracing_pipe_buf_ops = {
+ static const struct pipe_buf_operations tracing_pipe_buf_ops = {
        .can_merge              = 0,
        .map                    = generic_pipe_buf_map,
        .unmap                  = generic_pipe_buf_unmap,
@@@ -3299,7 -3318,7 +3318,7 @@@ tracing_entries_write(struct file *filp
                }
        }
  
-       filp->f_pos += cnt;
+       *ppos += cnt;
  
        /* If check pages failed, return ENOMEM */
        if (tracing_disabled)
@@@ -3334,7 -3353,6 +3353,6 @@@ tracing_mark_write(struct file *filp, c
                                        size_t cnt, loff_t *fpos)
  {
        char *buf;
-       char *end;
  
        if (tracing_disabled)
                return -EINVAL;
        if (cnt > TRACE_BUF_SIZE)
                cnt = TRACE_BUF_SIZE;
  
-       buf = kmalloc(cnt + 1, GFP_KERNEL);
+       buf = kmalloc(cnt + 2, GFP_KERNEL);
        if (buf == NULL)
                return -ENOMEM;
  
                kfree(buf);
                return -EFAULT;
        }
+       if (buf[cnt-1] != '\n') {
+               buf[cnt] = '\n';
+               buf[cnt+1] = '\0';
+       } else
+               buf[cnt] = '\0';
  
-       /* Cut from the first nil or newline. */
-       buf[cnt] = '\0';
-       end = strchr(buf, '\n');
-       if (end)
-               *end = '\0';
-       cnt = mark_printk("%s\n", buf);
+       cnt = mark_printk("%s", buf);
        kfree(buf);
        *fpos += cnt;
  
        return cnt;
  }
  
- static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
-                                 size_t cnt, loff_t *ppos)
+ static int tracing_clock_show(struct seq_file *m, void *v)
  {
-       char buf[64];
-       int bufiter = 0;
        int i;
  
        for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
-               bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
+               seq_printf(m,
                        "%s%s%s%s", i ? " " : "",
                        i == trace_clock_id ? "[" : "", trace_clocks[i].name,
                        i == trace_clock_id ? "]" : "");
-       bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
+       seq_putc(m, '\n');
  
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
+       return 0;
  }
  
  static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
        return cnt;
  }
  
+ static int tracing_clock_open(struct inode *inode, struct file *file)
+ {
+       if (tracing_disabled)
+               return -ENODEV;
+       return single_open(file, tracing_clock_show, NULL);
+ }
  static const struct file_operations tracing_max_lat_fops = {
        .open           = tracing_open_generic,
        .read           = tracing_max_lat_read,
@@@ -3458,8 -3479,10 +3479,10 @@@ static const struct file_operations tra
  };
  
  static const struct file_operations trace_clock_fops = {
-       .open           = tracing_open_generic,
-       .read           = tracing_clock_read,
+       .open           = tracing_clock_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
        .write          = tracing_clock_write,
  };
  
@@@ -3589,7 -3612,7 +3612,7 @@@ static void buffer_pipe_buf_get(struct 
  }
  
  /* Pipe buffer operations for a buffer. */
- static struct pipe_buf_operations buffer_pipe_buf_ops = {
+ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
        .can_merge              = 0,
        .map                    = generic_pipe_buf_map,
        .unmap                  = generic_pipe_buf_unmap,
@@@ -3730,7 -3753,7 +3753,7 @@@ tracing_stats_read(struct file *filp, c
  
        s = kmalloc(sizeof(*s), GFP_KERNEL);
        if (!s)
-               return ENOMEM;
+               return -ENOMEM;
  
        trace_seq_init(s);
  
@@@ -3920,39 -3943,16 +3943,16 @@@ trace_options_write(struct file *filp, 
        if (ret < 0)
                return ret;
  
-       ret = 0;
-       switch (val) {
-       case 0:
-               /* do nothing if already cleared */
-               if (!(topt->flags->val & topt->opt->bit))
-                       break;
-               mutex_lock(&trace_types_lock);
-               if (current_trace->set_flag)
-                       ret = current_trace->set_flag(topt->flags->val,
-                                                     topt->opt->bit, 0);
-               mutex_unlock(&trace_types_lock);
-               if (ret)
-                       return ret;
-               topt->flags->val &= ~topt->opt->bit;
-               break;
-       case 1:
-               /* do nothing if already set */
-               if (topt->flags->val & topt->opt->bit)
-                       break;
+       if (val != 0 && val != 1)
+               return -EINVAL;
  
+       if (!!(topt->flags->val & topt->opt->bit) != val) {
                mutex_lock(&trace_types_lock);
-               if (current_trace->set_flag)
-                       ret = current_trace->set_flag(topt->flags->val,
-                                                     topt->opt->bit, 1);
+               ret = __set_tracer_option(current_trace, topt->flags,
+                                         topt->opt, !val);
                mutex_unlock(&trace_types_lock);
                if (ret)
                        return ret;
-               topt->flags->val |= topt->opt->bit;
-               break;
-       default:
-               return -EINVAL;
        }
  
        *ppos += cnt;
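
For reference, a small userspace C sketch of the simplified flag handling above: normalise the current bit state with !! so it compares directly against the 0/1 value the user wrote, and only invoke the update callback when the state actually changes (the kernel helper takes a "neg" argument, hence the !val in the hunk). The names opts, OPT_BIT, set_flag_cb and write_option are hypothetical stand-ins, not the tracer's API.

#include <stdio.h>

#define OPT_BIT 0x04                    /* the option bit being toggled */

static unsigned int opts;               /* current flag word */

/* stand-in for a tracer's set_flag callback */
static int set_flag_cb(unsigned int old, unsigned int bit, int set)
{
        printf("set_flag: bit 0x%x -> %d\n", bit, set);
        return 0;
}

static int write_option(unsigned int val)
{
        int ret;

        if (val != 0 && val != 1)
                return -1;              /* -EINVAL in the kernel version */

        /* !! folds any non-zero bit state down to 1 for the comparison */
        if (!!(opts & OPT_BIT) != val) {
                ret = set_flag_cb(opts, OPT_BIT, val);
                if (ret)
                        return ret;
                if (val)
                        opts |= OPT_BIT;
                else
                        opts &= ~OPT_BIT;
        }
        return 0;
}

int main(void)
{
        write_option(1);                /* sets the bit, callback fires */
        write_option(1);                /* no change, callback skipped  */
        write_option(0);                /* clears the bit               */
        return 0;
}
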
@@@ -4279,8 -4279,8 +4279,8 @@@ trace_printk_seq(struct trace_seq *s
  
  static void __ftrace_dump(bool disable_tracing)
  {
-       static raw_spinlock_t ftrace_dump_lock =
-               (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+       static arch_spinlock_t ftrace_dump_lock =
+               (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
        /* use static because iter can be a bit big for the stack */
        static struct trace_iterator iter;
        unsigned int old_userobj;
  
        /* only one dump */
        local_irq_save(flags);
-       __raw_spin_lock(&ftrace_dump_lock);
+       arch_spin_lock(&ftrace_dump_lock);
        if (dump_ran)
                goto out;
  
        }
  
   out:
-       __raw_spin_unlock(&ftrace_dump_lock);
+       arch_spin_unlock(&ftrace_dump_lock);
        local_irq_restore(flags);
  }
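
The lock rename above follows the renaming in the 2.6.33 cycle, where the low-level raw_spinlock_t/__raw_spin_lock() primitives became arch_spinlock_t/arch_spin_lock(). A hedged kernel-style sketch of the same "run the dump only once" shape, with an invented function name (my_dump_once); arch_spin_lock() does not disable interrupts itself, which is why IRQs are masked around it, exactly as in the code above.

#include <linux/spinlock.h>

static void my_dump_once(void)
{
        static arch_spinlock_t dump_lock =
                (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
        static int dump_ran;
        unsigned long flags;

        local_irq_save(flags);
        arch_spin_lock(&dump_lock);
        if (dump_ran)
                goto out;
        dump_ran = 1;

        /* ... emit the dump exactly once here ... */

 out:
        arch_spin_unlock(&dump_lock);
        local_irq_restore(flags);
}
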
  
index 8614e3241ff86e5cffb7afd57887adfe525c1412,b1342c5d37cfb821cfb96fcfd610cb95cdfb1082..9d976f3249a3e044a38c0633eeb73a882bbbd828
  #include "trace.h"
  #include "trace_output.h"
  
- struct fgraph_data {
+ struct fgraph_cpu_data {
        pid_t           last_pid;
        int             depth;
+       int             ignore;
+ };
+
+ struct fgraph_data {
+       struct fgraph_cpu_data          *cpu_data;
+       /* Place to preserve last processed entry. */
+       struct ftrace_graph_ent_entry   ent;
+       struct ftrace_graph_ret_entry   ret;
+       int                             failed;
+       int                             cpu;
  };
  
  #define TRACE_GRAPH_INDENT    2
@@@ -176,7 -187,7 +187,7 @@@ static int __trace_graph_entry(struct t
        struct ring_buffer *buffer = tr->buffer;
        struct ftrace_graph_ent_entry *entry;
  
 -      if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
 +      if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                return 0;
  
        event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@@ -240,7 -251,7 +251,7 @@@ static void __trace_graph_return(struc
        struct ring_buffer *buffer = tr->buffer;
        struct ftrace_graph_ret_entry *entry;
  
 -      if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
 +      if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
                return;
  
        event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@@ -384,7 -395,7 +395,7 @@@ verif_pid(struct trace_seq *s, pid_t pi
        if (!data)
                return TRACE_TYPE_HANDLED;
  
-       last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
+       last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
  
        if (*last_pid == pid)
                return TRACE_TYPE_HANDLED;
@@@ -435,26 -446,49 +446,49 @@@ static struct ftrace_graph_ret_entry 
  get_return_for_leaf(struct trace_iterator *iter,
                struct ftrace_graph_ent_entry *curr)
  {
-       struct ring_buffer_iter *ring_iter;
+       struct fgraph_data *data = iter->private;
+       struct ring_buffer_iter *ring_iter = NULL;
        struct ring_buffer_event *event;
        struct ftrace_graph_ret_entry *next;
  
-       ring_iter = iter->buffer_iter[iter->cpu];
+       /*
+        * If the previous output failed to write to the seq buffer,
+        * then we just reuse the data from before.
+        */
+       if (data && data->failed) {
+               curr = &data->ent;
+               next = &data->ret;
+       } else {
  
-       /* First peek to compare current entry and the next one */
-       if (ring_iter)
-               event = ring_buffer_iter_peek(ring_iter, NULL);
-       else {
-       /* We need to consume the current entry to see the next one */
-               ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
-               event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
-                                       NULL);
-       }
+               ring_iter = iter->buffer_iter[iter->cpu];
+               /* First peek to compare current entry and the next one */
+               if (ring_iter)
+                       event = ring_buffer_iter_peek(ring_iter, NULL);
+               else {
+                       /*
+                        * We need to consume the current entry to see
+                        * the next one.
+                        */
+                       ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+                       event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+                                                NULL);
+               }
  
-       if (!event)
-               return NULL;
+               if (!event)
+                       return NULL;
+               next = ring_buffer_event_data(event);
  
-       next = ring_buffer_event_data(event);
+               if (data) {
+                       /*
+                        * Save current and next entries for later reference
+                        * if the output fails.
+                        */
+                       data->ent = *curr;
+                       data->ret = *next;
+               }
+       }
  
        if (next->ent.type != TRACE_GRAPH_RET)
                return NULL;
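
A userspace sketch of the asymmetry handled above: with a dedicated ring-buffer iterator the following entry can simply be peeked, but on the consuming (trace_pipe) path the current entry must be consumed before the next one becomes visible, which is why the kernel code caches the (curr, next) pair for a possible replay. The toy ring/ring_iter types below are illustrative only.

#include <stdio.h>
#include <stddef.h>

/* toy ring: head is the current, not yet consumed, entry */
struct ring { const int *buf; size_t len, head; };

/* separate iterator cursor, already positioned past the current entry */
struct ring_iter { const struct ring *r; size_t pos; };

static const int *iter_peek(const struct ring_iter *it)
{
        return it->pos < it->r->len ? &it->r->buf[it->pos] : NULL;
}

static void ring_consume(struct ring *r)
{
        if (r->head < r->len)
                r->head++;
}

static const int *ring_peek(const struct ring *r)
{
        return r->head < r->len ? &r->buf[r->head] : NULL;
}

/*
 * Look at the entry following the current one.  With an iterator this is a
 * plain peek; without one the current entry has to be consumed first, so the
 * caller had better save what it still needs before the output can fail.
 */
static const int *peek_following(struct ring *r, struct ring_iter *it)
{
        if (it)
                return iter_peek(it);
        ring_consume(r);                /* destructive: curr is gone now */
        return ring_peek(r);
}

int main(void)
{
        static const int log[] = { 10, 20, 30 };
        struct ring r = { log, 3, 0 };          /* current entry: 10 */
        const int *next = peek_following(&r, NULL);

        printf("entry after the current one: %d\n", next ? *next : -1);
        return 0;
}
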
@@@ -640,7 -674,7 +674,7 @@@ print_graph_entry_leaf(struct trace_ite
  
        if (data) {
                int cpu = iter->cpu;
-               int *depth = &(per_cpu_ptr(data, cpu)->depth);
+               int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
  
                /*
                 * Comments display at + 1 to depth. Since
@@@ -688,7 -722,7 +722,7 @@@ print_graph_entry_nested(struct trace_i
  
        if (data) {
                int cpu = iter->cpu;
-               int *depth = &(per_cpu_ptr(data, cpu)->depth);
+               int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
  
                *depth = call->depth;
        }
@@@ -782,19 -816,34 +816,34 @@@ static enum print_line_
  print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
                        struct trace_iterator *iter)
  {
-       int cpu = iter->cpu;
+       struct fgraph_data *data = iter->private;
        struct ftrace_graph_ent *call = &field->graph_ent;
        struct ftrace_graph_ret_entry *leaf_ret;
+       static enum print_line_t ret;
+       int cpu = iter->cpu;
  
        if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
                return TRACE_TYPE_PARTIAL_LINE;
  
        leaf_ret = get_return_for_leaf(iter, field);
        if (leaf_ret)
-               return print_graph_entry_leaf(iter, field, leaf_ret, s);
+               ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
        else
-               return print_graph_entry_nested(iter, field, s, cpu);
+               ret = print_graph_entry_nested(iter, field, s, cpu);
  
+       if (data) {
+               /*
+                * If we failed to write our output, then we need to make
+                * note of it. Because we already consumed our entry.
+                */
+               if (s->full) {
+                       data->failed = 1;
+                       data->cpu = cpu;
+               } else
+                       data->failed = 0;
+       }
+       return ret;
  }
  
  static enum print_line_t
@@@ -810,7 -859,7 +859,7 @@@ print_graph_return(struct ftrace_graph_
  
        if (data) {
                int cpu = iter->cpu;
-               int *depth = &(per_cpu_ptr(data, cpu)->depth);
+               int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
  
                /*
                 * Comments display at + 1 to depth. This is the
@@@ -873,7 -922,7 +922,7 @@@ print_graph_comment(struct trace_seq *s
        int i;
  
        if (data)
-               depth = per_cpu_ptr(data, iter->cpu)->depth;
+               depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
  
        if (print_graph_prologue(iter, s, 0, 0))
                return TRACE_TYPE_PARTIAL_LINE;
  enum print_line_t
  print_graph_function(struct trace_iterator *iter)
  {
+       struct ftrace_graph_ent_entry *field;
+       struct fgraph_data *data = iter->private;
        struct trace_entry *entry = iter->ent;
        struct trace_seq *s = &iter->seq;
+       int cpu = iter->cpu;
+       int ret;
+       if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
+               per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
+               return TRACE_TYPE_HANDLED;
+       }
+       /*
+        * If the last output failed, there's a possibility we need
+        * to print out the missing entry which would never go out.
+        */
+       if (data && data->failed) {
+               field = &data->ent;
+               iter->cpu = data->cpu;
+               ret = print_graph_entry(field, s, iter);
+               if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
+                       per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
+                       ret = TRACE_TYPE_NO_CONSUME;
+               }
+               iter->cpu = cpu;
+               return ret;
+       }
  
        switch (entry->type) {
        case TRACE_GRAPH_ENT: {
                 * sizeof(struct ftrace_graph_ent_entry) is very small,
                 * it can be safely saved at the stack.
                 */
-               struct ftrace_graph_ent_entry *field, saved;
+               struct ftrace_graph_ent_entry saved;
                trace_assign_type(field, entry);
                saved = *field;
                return print_graph_entry(&saved, s, iter);
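
The two hunks above make the graph printer robust against a full output buffer: the entry has already been consumed from the ring buffer, so on failure it is stashed (data->failed/data->ent) and replayed on the next call, with the per-CPU ignore flag preventing a second copy once it does go out. A self-contained userspace sketch of that replay idea follows; out_buf, stream and print_one are invented names, and the per-CPU bookkeeping is collapsed into a single stream for brevity.

#include <stdio.h>
#include <string.h>

struct out_buf { char buf[32]; size_t len; int full; };

struct stream { const int *entries; size_t len, pos; };

struct state { int failed; int saved; };

/* render one entry into the bounded output buffer; flag overflow */
static int emit(struct out_buf *o, int entry)
{
        char tmp[16];
        int n = snprintf(tmp, sizeof(tmp), "entry %d\n", entry);

        if (o->len + (size_t)n >= sizeof(o->buf)) {
                o->full = 1;
                return -1;
        }
        memcpy(o->buf + o->len, tmp, (size_t)n);
        o->len += (size_t)n;
        return 0;
}

/*
 * Emit one line per call.  Reading an entry consumes it from the stream, so
 * if the output buffer overflows the entry is remembered and replayed on the
 * next call instead of being silently lost.
 */
static void print_one(struct state *st, struct stream *s, struct out_buf *o)
{
        int entry;

        if (st->failed) {
                if (emit(o, st->saved) == 0)
                        st->failed = 0;
                return;                 /* nothing new consumed this round */
        }
        if (s->pos >= s->len)
                return;
        entry = s->entries[s->pos++];   /* consumed from the stream */
        if (emit(o, entry) < 0) {
                st->saved = entry;
                st->failed = 1;
        }
}

int main(void)
{
        static const int log[] = { 1, 2, 3, 4 };
        struct stream s = { log, 4, 0 };
        struct state st = { 0, 0 };
        struct out_buf o = { "", 0, 0 };
        int i;

        for (i = 0; i < 8; i++) {
                print_one(&st, &s, &o);
                if (o.full) {           /* flush and reuse the buffer */
                        fwrite(o.buf, 1, o.len, stdout);
                        o.len = 0;
                        o.full = 0;
                }
        }
        fwrite(o.buf, 1, o.len, stdout);
        return 0;
}
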
@@@ -1030,31 -1104,54 +1104,54 @@@ static void print_graph_headers(struct 
  static void graph_trace_open(struct trace_iterator *iter)
  {
        /* pid and depth on the last trace processed */
-       struct fgraph_data *data = alloc_percpu(struct fgraph_data);
+       struct fgraph_data *data;
        int cpu;
  
+       iter->private = NULL;
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
-               pr_warning("function graph tracer: not enough memory\n");
-       else
-               for_each_possible_cpu(cpu) {
-                       pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
-                       int *depth = &(per_cpu_ptr(data, cpu)->depth);
-                       *pid = -1;
-                       *depth = 0;
-               }
+               goto out_err;
+       data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
+       if (!data->cpu_data)
+               goto out_err_free;
+       for_each_possible_cpu(cpu) {
+               pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
+               int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
+               int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
+               *pid = -1;
+               *depth = 0;
+               *ignore = 0;
+       }
  
        iter->private = data;
+       return;
+  out_err_free:
+       kfree(data);
+  out_err:
+       pr_warning("function graph tracer: not enough memory\n");
  }
  
  static void graph_trace_close(struct trace_iterator *iter)
  {
-       free_percpu(iter->private);
+       struct fgraph_data *data = iter->private;
+       if (data) {
+               free_percpu(data->cpu_data);
+               kfree(data);
+       }
  }
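
A hedged kernel-style sketch of the allocation scheme graph_trace_open() switches to above: a small control structure kzalloc'd per iterator, its per-CPU part obtained with alloc_percpu(), initialised with for_each_possible_cpu(), and torn down in reverse order on close. The my_data/my_cpu_data names are invented; alloc_percpu(), per_cpu_ptr() and free_percpu() are the regular percpu API used by the diff itself.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/types.h>

struct my_cpu_data {
        pid_t   last_pid;
        int     depth;
        int     ignore;
};

struct my_data {
        struct my_cpu_data      *cpu_data;
        int                     failed;
        int                     cpu;
};

static struct my_data *my_data_open(void)
{
        struct my_data *data;
        int cpu;

        data = kzalloc(sizeof(*data), GFP_KERNEL);
        if (!data)
                return NULL;

        data->cpu_data = alloc_percpu(struct my_cpu_data);
        if (!data->cpu_data) {
                kfree(data);            /* unwind the partial allocation */
                return NULL;
        }

        for_each_possible_cpu(cpu) {
                struct my_cpu_data *d = per_cpu_ptr(data->cpu_data, cpu);

                d->last_pid = -1;       /* no PID seen on this CPU yet */
                d->depth = 0;
                d->ignore = 0;
        }
        return data;
}

static void my_data_close(struct my_data *data)
{
        if (data) {
                free_percpu(data->cpu_data);
                kfree(data);
        }
}
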
  
  static struct tracer graph_trace __read_mostly = {
        .name           = "function_graph",
        .open           = graph_trace_open,
+       .pipe_open      = graph_trace_open,
        .close          = graph_trace_close,
+       .pipe_close     = graph_trace_close,
        .wait_pipe      = poll_wait_pipe,
        .init           = graph_trace_init,
        .reset          = graph_trace_reset,
diff --combined mm/percpu.c
index 77c6f7994a46cc4add460e5f05ff055123b8a1a4,442010cc91c6c82eb8489e64d21500baa52b5911..626e43c99498d878281c20211b56afd361e93af2
@@@ -72,6 -72,7 +72,7 @@@
  #include <asm/cacheflush.h>
  #include <asm/sections.h>
  #include <asm/tlbflush.h>
+ #include <asm/io.h>
  
  #define PCPU_SLOT_BASE_SHIFT          5       /* 1-31 shares the same slot */
  #define PCPU_DFL_MAP_ALLOC            16      /* start a map with 16 ents */
@@@ -151,7 -152,10 +152,10 @@@ static int pcpu_reserved_chunk_limit
   *
   * During allocation, pcpu_alloc_mutex is kept locked all the time and
   * pcpu_lock is grabbed and released as necessary.  All actual memory
-  * allocations are done using GFP_KERNEL with pcpu_lock released.
+  * allocations are done using GFP_KERNEL with pcpu_lock released.  In
+  * general, percpu memory can't be allocated with irq off but
+  * irqsave/restore are still used in alloc path so that it can be used
+  * from early init path - sched_init() specifically.
   *
   * Free path accesses and alters only the index data structures, so it
   * can be safely called from atomic context.  When memory needs to be
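
The added comment is the constraint that drives the rest of this patch: spin_lock_irq()/spin_unlock_irq() unconditionally re-enable interrupts on unlock, so they cannot be used on a path that may already run with IRQs disabled (early sched_init()), whereas spin_lock_irqsave() puts back whatever IRQ state the caller had. A minimal hedged sketch; demo_lock is an invented lock, the locking calls are the standard kernel API.

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);

/* safe regardless of the caller's IRQ state: that state is restored */
static void touch_safe(void)
{
        unsigned long flags;

        spin_lock_irqsave(&demo_lock, flags);
        /* ... critical section ... */
        spin_unlock_irqrestore(&demo_lock, flags);
}

/*
 * Only correct when IRQs are known to be enabled on entry: the unlock
 * turns them back on unconditionally, which would be wrong inside an
 * early-init or otherwise IRQ-disabled caller.
 */
static void touch_irqs_on_only(void)
{
        spin_lock_irq(&demo_lock);
        /* ... critical section ... */
        spin_unlock_irq(&demo_lock);
}
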
@@@ -350,63 -354,86 +354,86 @@@ static struct pcpu_chunk *pcpu_chunk_ad
  }
  
  /**
-  * pcpu_extend_area_map - extend area map for allocation
-  * @chunk: target chunk
+  * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+  * @chunk: chunk of interest
   *
-  * Extend area map of @chunk so that it can accomodate an allocation.
-  * A single allocation can split an area into three areas, so this
-  * function makes sure that @chunk->map has at least two extra slots.
+  * Determine whether area map of @chunk needs to be extended to
+  * accommodate a new allocation.
   *
   * CONTEXT:
-  * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
-  * if area map is extended.
+  * pcpu_lock.
   *
   * RETURNS:
-  * 0 if noop, 1 if successfully extended, -errno on failure.
+  * New target map allocation length if extension is necessary, 0
+  * otherwise.
   */
- static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
-       __releases(lock) __acquires(lock)
+ static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
  {
        int new_alloc;
-       int *new;
-       size_t size;
  
-       /* has enough? */
        if (chunk->map_alloc >= chunk->map_used + 2)
                return 0;
  
-       spin_unlock_irq(&pcpu_lock);
        new_alloc = PCPU_DFL_MAP_ALLOC;
        while (new_alloc < chunk->map_used + 2)
                new_alloc *= 2;
  
-       new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-       if (!new) {
-               spin_lock_irq(&pcpu_lock);
+       return new_alloc;
+ }
+
+ /**
+  * pcpu_extend_area_map - extend area map of a chunk
+  * @chunk: chunk of interest
+  * @new_alloc: new target allocation length of the area map
+  *
+  * Extend area map of @chunk to have @new_alloc entries.
+  *
+  * CONTEXT:
+  * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+  *
+  * RETURNS:
+  * 0 on success, -errno on failure.
+  */
+ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+ {
+       int *old = NULL, *new = NULL;
+       size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+       unsigned long flags;
+       new = pcpu_mem_alloc(new_size);
+       if (!new)
                return -ENOMEM;
-       }
  
-       /*
-        * Acquire pcpu_lock and switch to new area map.  Only free
-        * could have happened inbetween, so map_used couldn't have
-        * grown.
-        */
-       spin_lock_irq(&pcpu_lock);
-       BUG_ON(new_alloc < chunk->map_used + 2);
+       /* acquire pcpu_lock and switch to new area map */
+       spin_lock_irqsave(&pcpu_lock, flags);
+       if (new_alloc <= chunk->map_alloc)
+               goto out_unlock;
  
-       size = chunk->map_alloc * sizeof(chunk->map[0]);
-       memcpy(new, chunk->map, size);
+       old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+       memcpy(new, chunk->map, old_size);
  
        /*
         * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
         * one of the first chunks and still using static map.
         */
        if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-               pcpu_mem_free(chunk->map, size);
+               old = chunk->map;
  
        chunk->map_alloc = new_alloc;
        chunk->map = new;
+       new = NULL;
+ out_unlock:
+       spin_unlock_irqrestore(&pcpu_lock, flags);
+       /*
+        * pcpu_mem_free() might end up calling vfree() which uses
+        * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+        */
+       pcpu_mem_free(old, old_size);
+       pcpu_mem_free(new, new_size);
        return 0;
  }
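
The rewritten helpers above split "do we need to grow?" (answered under pcpu_lock) from the growth itself: memory is allocated with the lock dropped, the pointer swap happens under the lock with a recheck, and both the superseded old map and an unused new one are freed only after unlocking, since freeing may take IRQ-unsafe locks. A userspace sketch of that shape, with a pthread mutex standing in for pcpu_lock and malloc/free for pcpu_mem_alloc/pcpu_mem_free; it omits the first-chunk static-map special case.

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

struct chunk {
        int *map;
        int map_alloc;                  /* slots allocated */
        int map_used;                   /* slots in use */
};

#define DFL_MAP_ALLOC 16

/* call with map_lock held: 0 if there is room, else the new target length */
static int need_to_extend(struct chunk *c)
{
        int new_alloc;

        if (c->map_alloc >= c->map_used + 2)
                return 0;
        new_alloc = DFL_MAP_ALLOC;
        while (new_alloc < c->map_used + 2)
                new_alloc *= 2;
        return new_alloc;
}

/* call with map_lock dropped: allocate outside, swap under the lock */
static int extend_map(struct chunk *c, int new_alloc)
{
        int *old = NULL, *new = malloc(new_alloc * sizeof(*new));
        size_t old_size = 0;

        if (!new)
                return -1;

        pthread_mutex_lock(&map_lock);
        if (new_alloc <= c->map_alloc)
                goto out_unlock;        /* someone else grew it meanwhile */

        old_size = c->map_alloc * sizeof(c->map[0]);
        memcpy(new, c->map, old_size);
        old = c->map;
        c->map = new;
        c->map_alloc = new_alloc;
        new = NULL;                     /* ownership passed to the chunk */

out_unlock:
        pthread_mutex_unlock(&map_lock);

        /* freeing can be "heavy", so it happens outside map_lock */
        free(old);
        free(new);
        return 0;
}

int main(void)
{
        struct chunk c = { calloc(DFL_MAP_ALLOC, sizeof(int)),
                           DFL_MAP_ALLOC, 15 };
        int new_alloc;

        pthread_mutex_lock(&map_lock);
        new_alloc = need_to_extend(&c);         /* 15 + 2 > 16: grow to 32 */
        pthread_mutex_unlock(&map_lock);

        if (new_alloc)
                extend_map(&c, new_alloc);
        free(c.map);
        return 0;
}
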
  
@@@ -886,10 -913,11 +913,10 @@@ static void pcpu_depopulate_chunk(struc
        int rs, re;
  
        /* quick path, check whether it's empty already */
 -      pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
 -              if (rs == page_start && re == page_end)
 -                      return;
 -              break;
 -      }
 +      rs = page_start;
 +      pcpu_next_unpop(chunk, &rs, &re, page_end);
 +      if (rs == page_start && re == page_end)
 +              return;
  
        /* immutable chunks can't be depopulated */
        WARN_ON(chunk->immutable);
@@@ -940,10 -968,11 +967,10 @@@ static int pcpu_populate_chunk(struct p
        int rs, re, rc;
  
        /* quick path, check whether all pages are already there */
 -      pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
 -              if (rs == page_start && re == page_end)
 -                      goto clear;
 -              break;
 -      }
 +      rs = page_start;
 +      pcpu_next_pop(chunk, &rs, &re, page_end);
 +      if (rs == page_start && re == page_end)
 +              goto clear;
  
        /* need to allocate and map pages, this chunk can't be immutable */
        WARN_ON(chunk->immutable);
@@@ -1043,7 -1072,8 +1070,8 @@@ static void *pcpu_alloc(size_t size, si
        static int warn_limit = 10;
        struct pcpu_chunk *chunk;
        const char *err;
-       int slot, off;
+       int slot, off, new_alloc;
+       unsigned long flags;
  
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
                WARN(true, "illegal size (%zu) or align (%zu) for "
        }
  
        mutex_lock(&pcpu_alloc_mutex);
-       spin_lock_irq(&pcpu_lock);
+       spin_lock_irqsave(&pcpu_lock, flags);
  
        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;
-               if (size > chunk->contig_hint ||
-                   pcpu_extend_area_map(chunk) < 0) {
-                       err = "failed to extend area map of reserved chunk";
+               if (size > chunk->contig_hint) {
+                       err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }
+               while ((new_alloc = pcpu_need_to_extend(chunk))) {
+                       spin_unlock_irqrestore(&pcpu_lock, flags);
+                       if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+                               err = "failed to extend area map of reserved chunk";
+                               goto fail_unlock_mutex;
+                       }
+                       spin_lock_irqsave(&pcpu_lock, flags);
+               }
                off = pcpu_alloc_area(chunk, size, align);
                if (off >= 0)
                        goto area_found;
                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }
@@@ -1076,14 -1117,20 +1115,20 @@@ restart
                        if (size > chunk->contig_hint)
                                continue;
  
-                       switch (pcpu_extend_area_map(chunk)) {
-                       case 0:
-                               break;
-                       case 1:
-                               goto restart;   /* pcpu_lock dropped, restart */
-                       default:
-                               err = "failed to extend area map";
-                               goto fail_unlock;
+                       new_alloc = pcpu_need_to_extend(chunk);
+                       if (new_alloc) {
+                               spin_unlock_irqrestore(&pcpu_lock, flags);
+                               if (pcpu_extend_area_map(chunk,
+                                                        new_alloc) < 0) {
+                                       err = "failed to extend area map";
+                                       goto fail_unlock_mutex;
+                               }
+                               spin_lock_irqsave(&pcpu_lock, flags);
+                               /*
+                                * pcpu_lock has been dropped, need to
+                                * restart cpu_slot list walking.
+                                */
+                               goto restart;
                        }
  
                        off = pcpu_alloc_area(chunk, size, align);
        }
  
        /* hmmm... no space left, create a new chunk */
-       spin_unlock_irq(&pcpu_lock);
+       spin_unlock_irqrestore(&pcpu_lock, flags);
  
        chunk = alloc_pcpu_chunk();
        if (!chunk) {
                goto fail_unlock_mutex;
        }
  
-       spin_lock_irq(&pcpu_lock);
+       spin_lock_irqsave(&pcpu_lock, flags);
        pcpu_chunk_relocate(chunk, -1);
        goto restart;
  
  area_found:
-       spin_unlock_irq(&pcpu_lock);
+       spin_unlock_irqrestore(&pcpu_lock, flags);
  
        /* populate, map and clear the area */
        if (pcpu_populate_chunk(chunk, off, size)) {
-               spin_lock_irq(&pcpu_lock);
+               spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_free_area(chunk, off);
                err = "failed to populate";
                goto fail_unlock;
        return __addr_to_pcpu_ptr(chunk->base_addr + off);
  
  fail_unlock:
-       spin_unlock_irq(&pcpu_lock);
+       spin_unlock_irqrestore(&pcpu_lock, flags);
  fail_unlock_mutex:
        mutex_unlock(&pcpu_alloc_mutex);
        if (warn_limit) {
@@@ -1254,6 -1301,27 +1299,27 @@@ void free_percpu(void *ptr
  }
  EXPORT_SYMBOL_GPL(free_percpu);
  
+ /**
+  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+  * @addr: the address to be converted to physical address
+  *
+  * Given @addr which is a dereferenceable address obtained via one of
+  * percpu access macros, this function translates it into its physical
+  * address.  The caller is responsible for ensuring @addr stays valid
+  * until this function finishes.
+  *
+  * RETURNS:
+  * The physical address for @addr.
+  */
+ phys_addr_t per_cpu_ptr_to_phys(void *addr)
+ {
+       if ((unsigned long)addr < VMALLOC_START ||
+                       (unsigned long)addr >= VMALLOC_END)
+               return __pa(addr);
+       else
+               return page_to_phys(vmalloc_to_page(addr));
+ }
+
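
A short hedged usage sketch for the helper added above: translate each CPU's copy of a dynamically allocated percpu object into a physical address, e.g. to register it with firmware or a device. The my_stats type and the register_buffer_phys() consumer are hypothetical; alloc_percpu(), per_cpu_ptr() and per_cpu_ptr_to_phys() are the real API, and the helper works whether the percpu area sits in the linear mapping or in vmalloc space.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/types.h>

struct my_stats { u64 calls; u64 errors; };

/* hypothetical consumer that needs physical addresses (e.g. a hypervisor) */
extern void register_buffer_phys(int cpu, phys_addr_t pa);

static int register_percpu_stats(void)
{
        struct my_stats *stats = alloc_percpu(struct my_stats);
        int cpu;

        if (!stats)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                /* per_cpu_ptr() gives the kernel virtual address of this
                 * CPU's copy; per_cpu_ptr_to_phys() turns it into a
                 * physical address. */
                register_buffer_phys(cpu,
                        per_cpu_ptr_to_phys(per_cpu_ptr(stats, cpu)));
        }
        return 0;
}
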
  static inline size_t pcpu_calc_fc_sizes(size_t static_size,
                                        size_t reserved_size,
                                        ssize_t *dyn_sizep)