Merge branch 'powernv-cpuidle' of git://git.kernel.org/pub/scm/linux/kernel/git/benh...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 2 Apr 2014 20:47:29 +0000 (13:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Wed, 2 Apr 2014 20:47:29 +0000 (13:47 -0700)
Pull powerpc non-virtualized cpuidle from Ben Herrenschmidt:
 "This is the branch I mentioned in my other pull request which contains
  our improved cpuidle support for the "powernv" platform
  (non-virtualized).

  It adds support for the "fast sleep" feature of the processor,
  which provides higher power savings than our usual "nap" mode but
  at the cost of losing the timers while asleep, and thus exploits
  the new timer broadcast framework to work around that limitation.

  It's based on a tip timer tree that you seem to have already merged"
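
In outline, the mechanism the diff below wires up works as follows
(this is a condensed sketch assembled from the patches themselves, not
standalone code): the decrementer clockevent gains CLOCK_EVT_FEAT_C3STOP
and the FastSleep cpuidle state is registered with
CPUIDLE_FLAG_TIMER_STOP, so the generic tick-broadcast framework takes
over a sleeping CPU's next timer event and delivers the wakeup through
the new PPC_MSG_TICK_BROADCAST IPI:

/* Arch hook (arch/powerpc/kernel/smp.c): wake the CPUs in @mask whose
 * local decrementer is stopped by Fast Sleep. */
void tick_broadcast(const struct cpumask *mask)
{
	unsigned int cpu;

	for_each_cpu(cpu, mask)
		do_message_pass(cpu, PPC_MSG_TICK_BROADCAST);
}

/* IPI handler (arch/powerpc/kernel/time.c): treat the decrementer as
 * having just expired and run the normal timer work on the woken CPU. */
void tick_broadcast_ipi_handler(void)
{
	u64 *next_tb = &__get_cpu_var(decrementers_next_tb);

	*next_tb = get_tb_or_rtc();
	__timer_interrupt();
}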

* 'powernv-cpuidle' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc:
  cpuidle/powernv: Parse device tree to setup idle states
  cpuidle/powernv: Add "Fast-Sleep" CPU idle state
  powerpc/powernv: Add OPAL call to resync timebase on wakeup
  powerpc/powernv: Add context management for Fast Sleep
  powerpc: Split timer_interrupt() into timer handling and interrupt handling routines
  powerpc: Implement tick broadcast IPI as a fixed IPI message
  powerpc: Free up the slot of PPC_MSG_CALL_FUNC_SINGLE IPI message

13 files changed:
arch/powerpc/Kconfig
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/processor.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/time.h
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/idle_power7.S
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/time.c
arch/powerpc/platforms/cell/interrupt.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/ps3/smp.c
drivers/cpuidle/cpuidle-powernv.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 05e532984c13212b47a87e04412a722e9e6d889a..f3d7846bc9b284b6893546f8cca2edd5c72d6809 100644
@@ -130,6 +130,8 @@ config PPC
        select GENERIC_CMOS_UPDATE
        select GENERIC_TIME_VSYSCALL_OLD
        select GENERIC_CLOCKEVENTS
+       select GENERIC_CLOCKEVENTS_BROADCAST if SMP
+       select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select GENERIC_STRNCPY_FROM_USER
        select GENERIC_STRNLEN_USER
        select HAVE_MOD_ARCH_SPECIFIC
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ffafab037ba860b5eaada654528a8bb9332ee338..fe2aa0b48d2b00e43e47212ddb55ce6c5142537f 100644
@@ -161,6 +161,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_FLASH_VALIDATE                    76
 #define OPAL_FLASH_MANAGE                      77
 #define OPAL_FLASH_UPDATE                      78
+#define OPAL_RESYNC_TIMEBASE                   79
 #define OPAL_DUMP_INIT                         81
 #define OPAL_DUMP_INFO                         82
 #define OPAL_DUMP_READ                         83
@@ -923,6 +924,7 @@ extern int opal_machine_check(struct pt_regs *regs);
 extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
 
 extern void opal_shutdown(void);
+extern int opal_resync_timebase(void);
 
 extern void opal_lpc_init(void);
 
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index b62de43ae5f344a02d6096af7842c3799c192b93..d660dc36831afd5420c6da1dc1a35e9ee9eb6234 100644
@@ -450,6 +450,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
 extern int powersave_nap;      /* set if nap mode can be used in idle loop */
 extern void power7_nap(void);
+extern void power7_sleep(void);
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
 extern void poweroff_now(void);
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 084e0807db988e2a24b836df800406739f91a7e9..ff51046b6466993ead747864b115facbd08af6b4 100644
@@ -120,7 +120,7 @@ extern int cpu_to_core_id(int cpu);
  * in /proc/interrupts will be wrong!!! --Troy */
 #define PPC_MSG_CALL_FUNCTION   0
 #define PPC_MSG_RESCHEDULE      1
-#define PPC_MSG_CALL_FUNC_SINGLE       2
+#define PPC_MSG_TICK_BROADCAST 2
 #define PPC_MSG_DEBUGGER_BREAK  3
 
 /* for irq controllers that have dedicated ipis per message (4) */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f267694acbecd7f072245f57199faa0594e3c6..1d428e6007caa64de18efcd0f5c5291ad53b9f94 100644
@@ -28,6 +28,7 @@ extern struct clock_event_device decrementer_clockevent;
 struct rtc_time;
 extern void to_tm(int tim, struct rtc_time * tm);
 extern void GregorianDay(struct rtc_time *tm);
+extern void tick_broadcast_ipi_handler(void);
 
 extern void generic_calibrate_decr(void);
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 4c34c3c827ad5be65a4f59381c9f485086662fe9..d9c650ec7dac2ee047e5c9ab32d4dda61d31d808 100644
@@ -121,9 +121,10 @@ BEGIN_FTR_SECTION
        cmpwi   cr1,r13,2
        /* Total loss of HV state is fatal, we could try to use the
         * PIR to locate a PACA, then use an emergency stack etc...
-        * but for now, let's just stay stuck here
+        * OPAL v3 based powernv platforms have new idle states
+        * which fall in this category.
         */
-       bgt     cr1,.
+       bgt     cr1,8f
        GET_PACA(r13)
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -141,6 +142,11 @@ BEGIN_FTR_SECTION
        beq     cr1,2f
        b       .power7_wakeup_noloss
 2:     b       .power7_wakeup_loss
+
+       /* Fast Sleep wakeup on PowerNV */
+8:     GET_PACA(r13)
+       b       .power7_wakeup_tb_loss
+
 9:
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index 3fdef0f0c67fa959e631a4ccee2d3a330001fa44..c3ab86975614a4a9d4af842962fa66f32c5c8e47 100644
 #include <asm/ppc-opcode.h>
 #include <asm/hw_irq.h>
 #include <asm/kvm_book3s_asm.h>
+#include <asm/opal.h>
 
 #undef DEBUG
 
-       .text
+/* Idle state entry routines */
 
-_GLOBAL(power7_idle)
-       /* Now check if user or arch enabled NAP mode */
-       LOAD_REG_ADDRBASE(r3,powersave_nap)
-       lwz     r4,ADDROFF(powersave_nap)(r3)
-       cmpwi   0,r4,0
-       beqlr
-       /* fall through */
+#define        IDLE_STATE_ENTER_SEQ(IDLE_INST)                         \
+       /* Magic NAP/SLEEP/WINKLE mode enter sequence */        \
+       std     r0,0(r1);                                       \
+       ptesync;                                                \
+       ld      r0,0(r1);                                       \
+1:     cmp     cr0,r0,r0;                                      \
+       bne     1b;                                             \
+       IDLE_INST;                                              \
+       b       .
 
-_GLOBAL(power7_nap)
+       .text
+
+/*
+ * Pass requested state in r3:
+ *     0 - nap
+ *     1 - sleep
+ */
+_GLOBAL(power7_powersave_common)
+       /* Use r3 to pass state nap/sleep/winkle */
        /* NAP is a state loss, we create a regs frame on the
         * stack, fill it up with the state we care about and
         * stick a pointer to it in PACAR1. We really only
@@ -79,8 +90,8 @@ _GLOBAL(power7_nap)
        /* Continue saving state */
        SAVE_GPR(2, r1)
        SAVE_NVGPRS(r1)
-       mfcr    r3
-       std     r3,_CCR(r1)
+       mfcr    r4
+       std     r4,_CCR(r1)
        std     r9,_MSR(r1)
        std     r1,PACAR1(r13)
 
@@ -90,15 +101,56 @@ _GLOBAL(power7_enter_nap_mode)
        li      r4,KVM_HWTHREAD_IN_NAP
        stb     r4,HSTATE_HWTHREAD_STATE(r13)
 #endif
+       cmpwi   cr0,r3,1
+       beq     2f
+       IDLE_STATE_ENTER_SEQ(PPC_NAP)
+       /* No return */
+2:     IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+       /* No return */
 
-       /* Magic NAP mode enter sequence */
-       std     r0,0(r1)
-       ptesync
-       ld      r0,0(r1)
-1:     cmp     cr0,r0,r0
-       bne     1b
-       PPC_NAP
-       b       .
+_GLOBAL(power7_idle)
+       /* Now check if user or arch enabled NAP mode */
+       LOAD_REG_ADDRBASE(r3,powersave_nap)
+       lwz     r4,ADDROFF(powersave_nap)(r3)
+       cmpwi   0,r4,0
+       beqlr
+       /* fall through */
+
+_GLOBAL(power7_nap)
+       li      r3,0
+       b       power7_powersave_common
+       /* No return */
+
+_GLOBAL(power7_sleep)
+       li      r3,1
+       b       power7_powersave_common
+       /* No return */
+
+_GLOBAL(power7_wakeup_tb_loss)
+       ld      r2,PACATOC(r13);
+       ld      r1,PACAR1(r13)
+
+       /* Time base re-sync */
+       li      r0,OPAL_RESYNC_TIMEBASE
+       LOAD_REG_ADDR(r11,opal);
+       ld      r12,8(r11);
+       ld      r2,0(r11);
+       mtctr   r12
+       bctrl
+
+       /* TODO: Check r3 for failure */
+
+       REST_NVGPRS(r1)
+       REST_GPR(2, r1)
+       ld      r3,_CCR(r1)
+       ld      r4,_MSR(r1)
+       ld      r5,_NIP(r1)
+       addi    r1,r1,INT_FRAME_SIZE
+       mtcr    r3
+       mfspr   r3,SPRN_SRR1            /* Return SRR1 */
+       mtspr   SPRN_SRR1,r4
+       mtspr   SPRN_SRR0,r5
+       rfid
 
 _GLOBAL(power7_wakeup_loss)
        ld      r1,PACAR1(r13)
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ac2621af31545b4aa3a4a434f7119c2c3aac222e..e2a4232c5871a19056b0b5b666adb540a17b1403 100644
@@ -35,6 +35,7 @@
 #include <asm/ptrace.h>
 #include <linux/atomic.h>
 #include <asm/irq.h>
+#include <asm/hw_irq.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/prom.h>
@@ -145,9 +146,9 @@ static irqreturn_t reschedule_action(int irq, void *data)
        return IRQ_HANDLED;
 }
 
-static irqreturn_t call_function_single_action(int irq, void *data)
+static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
 {
-       generic_smp_call_function_single_interrupt();
+       tick_broadcast_ipi_handler();
        return IRQ_HANDLED;
 }
 
@@ -168,14 +169,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
 static irq_handler_t smp_ipi_action[] = {
        [PPC_MSG_CALL_FUNCTION] =  call_function_action,
        [PPC_MSG_RESCHEDULE] = reschedule_action,
-       [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
+       [PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
        [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
 };
 
 const char *smp_ipi_name[] = {
        [PPC_MSG_CALL_FUNCTION] =  "ipi call function",
        [PPC_MSG_RESCHEDULE] = "ipi reschedule",
-       [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
+       [PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
        [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
 };
 
@@ -251,8 +252,8 @@ irqreturn_t smp_ipi_demux(void)
                        generic_smp_call_function_interrupt();
                if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
                        scheduler_ipi();
-               if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNC_SINGLE))
-                       generic_smp_call_function_single_interrupt();
+               if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
+                       tick_broadcast_ipi_handler();
                if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
                        debug_ipi_action(0, NULL);
        } while (info->messages);
@@ -280,7 +281,7 @@ EXPORT_SYMBOL_GPL(smp_send_reschedule);
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-       do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
+       do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
 void arch_send_call_function_ipi_mask(const struct cpumask *mask)
@@ -291,6 +292,16 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
                do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
 }
 
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+void tick_broadcast(const struct cpumask *mask)
+{
+       unsigned int cpu;
+
+       for_each_cpu(cpu, mask)
+               do_message_pass(cpu, PPC_MSG_TICK_BROADCAST);
+}
+#endif
+
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
 void smp_send_debugger_break(void)
 {
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b3dab20acf34abe126e244d34a7dee39021457de..122a580f732246c02c5e31c301078cb725435919 100644
@@ -42,6 +42,7 @@
 #include <linux/timex.h>
 #include <linux/kernel_stat.h>
 #include <linux/time.h>
+#include <linux/clockchips.h>
 #include <linux/init.h>
 #include <linux/profile.h>
 #include <linux/cpu.h>
@@ -106,7 +107,7 @@ struct clock_event_device decrementer_clockevent = {
        .irq            = 0,
        .set_next_event = decrementer_set_next_event,
        .set_mode       = decrementer_set_mode,
-       .features       = CLOCK_EVT_FEAT_ONESHOT,
+       .features       = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP,
 };
 EXPORT_SYMBOL(decrementer_clockevent);
 
@@ -478,6 +479,47 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+void __timer_interrupt(void)
+{
+       struct pt_regs *regs = get_irq_regs();
+       u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+       struct clock_event_device *evt = &__get_cpu_var(decrementers);
+       u64 now;
+
+       trace_timer_interrupt_entry(regs);
+
+       if (test_irq_work_pending()) {
+               clear_irq_work_pending();
+               irq_work_run();
+       }
+
+       now = get_tb_or_rtc();
+       if (now >= *next_tb) {
+               *next_tb = ~(u64)0;
+               if (evt->event_handler)
+                       evt->event_handler(evt);
+               __get_cpu_var(irq_stat).timer_irqs_event++;
+       } else {
+               now = *next_tb - now;
+               if (now <= DECREMENTER_MAX)
+                       set_dec((int)now);
+               /* We may have raced with new irq work */
+               if (test_irq_work_pending())
+                       set_dec(1);
+               __get_cpu_var(irq_stat).timer_irqs_others++;
+       }
+
+#ifdef CONFIG_PPC64
+       /* collect purr register values often, for accurate calculations */
+       if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+               struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
+               cu->current_tb = mfspr(SPRN_PURR);
+       }
+#endif
+
+       trace_timer_interrupt_exit(regs);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -486,8 +528,6 @@ void timer_interrupt(struct pt_regs * regs)
 {
        struct pt_regs *old_regs;
        u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
-       struct clock_event_device *evt = &__get_cpu_var(decrementers);
-       u64 now;
 
        /* Ensure a positive value is written to the decrementer, or else
         * some CPUs will continue to take decrementer exceptions.
@@ -519,39 +559,7 @@ void timer_interrupt(struct pt_regs * regs)
        old_regs = set_irq_regs(regs);
        irq_enter();
 
-       trace_timer_interrupt_entry(regs);
-
-       if (test_irq_work_pending()) {
-               clear_irq_work_pending();
-               irq_work_run();
-       }
-
-       now = get_tb_or_rtc();
-       if (now >= *next_tb) {
-               *next_tb = ~(u64)0;
-               if (evt->event_handler)
-                       evt->event_handler(evt);
-               __get_cpu_var(irq_stat).timer_irqs_event++;
-       } else {
-               now = *next_tb - now;
-               if (now <= DECREMENTER_MAX)
-                       set_dec((int)now);
-               /* We may have raced with new irq work */
-               if (test_irq_work_pending())
-                       set_dec(1);
-               __get_cpu_var(irq_stat).timer_irqs_others++;
-       }
-
-#ifdef CONFIG_PPC64
-       /* collect purr register values often, for accurate calculations */
-       if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-               struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
-               cu->current_tb = mfspr(SPRN_PURR);
-       }
-#endif
-
-       trace_timer_interrupt_exit(regs);
-
+       __timer_interrupt();
        irq_exit();
        set_irq_regs(old_regs);
 }
@@ -825,6 +833,15 @@ static void decrementer_set_mode(enum clock_event_mode mode,
                decrementer_set_next_event(DECREMENTER_MAX, dev);
 }
 
+/* Interrupt handler for the timer broadcast IPI */
+void tick_broadcast_ipi_handler(void)
+{
+       u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
+
+       *next_tb = get_tb_or_rtc();
+       __timer_interrupt();
+}
+
 static void register_decrementer_clockevent(int cpu)
 {
        struct clock_event_device *dec = &per_cpu(decrementers, cpu);
@@ -928,6 +945,7 @@ void __init time_init(void)
        clocksource_init();
 
        init_decrementer_clockevent();
+       tick_setup_hrtimer_broadcast();
 }
 
 
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index 2d42f3bb66d662e4e166b4e2ada1d763a8ef1788..8a106b4172e0e740dd8cedd7f54b6dc61a5bc270 100644
@@ -215,7 +215,7 @@ void iic_request_IPIs(void)
 {
        iic_request_ipi(PPC_MSG_CALL_FUNCTION);
        iic_request_ipi(PPC_MSG_RESCHEDULE);
-       iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
+       iic_request_ipi(PPC_MSG_TICK_BROADCAST);
        iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
 }
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 75c89df8d71e95b130dc827e036f25330cc1d6ad..bb90f9a4e0270be18a63a4126841fe81ff84cb12 100644
@@ -131,6 +131,7 @@ OPAL_CALL(opal_write_elog,                  OPAL_ELOG_WRITE);
 OPAL_CALL(opal_validate_flash,                 OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,                   OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,                   OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase,                        OPAL_RESYNC_TIMEBASE);
 OPAL_CALL(opal_dump_init,                      OPAL_DUMP_INIT);
 OPAL_CALL(opal_dump_info,                      OPAL_DUMP_INFO);
 OPAL_CALL(opal_dump_info2,                     OPAL_DUMP_INFO2);
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 4b35166229fe9eda50ed10be1b51998a6e9a5986..b358bec6c8cb16837aa460bbd6c9908cf9abf093 100644
@@ -76,7 +76,7 @@ static int __init ps3_smp_probe(void)
 
                BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION    != 0);
                BUILD_BUG_ON(PPC_MSG_RESCHEDULE       != 1);
-               BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
+               BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST   != 2);
                BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK   != 3);
 
                for (i = 0; i < MSG_COUNT; i++) {
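
Before the cpuidle driver changes below, a hypothetical, self-contained
user-space sketch of the device-tree parsing that the new
powernv_add_idle_states() performs: OPAL publishes
/ibm,opal/power-mgt/ibm,cpu-idle-state-flags as an array of u32 flag
words, and each word selects which states get registered beyond the
always-present "snooze". Only the two flag values are taken from the
driver; the sample flags array and the program itself are invented for
illustration.

#include <stdio.h>
#include <stdint.h>

#define IDLE_USE_INST_NAP	0x00010000	/* state uses the nap instruction */
#define IDLE_USE_INST_SLEEP	0x00020000	/* state uses the sleep instruction */

int main(void)
{
	/* Pretend this array came from ibm,cpu-idle-state-flags */
	uint32_t flags[] = { IDLE_USE_INST_NAP, IDLE_USE_INST_SLEEP };
	int nr_idle_states = 1;		/* "snooze" is always state 0 */
	size_t i;

	printf("state 0: snooze\n");
	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
		if (flags[i] & IDLE_USE_INST_NAP)
			printf("state %d: Nap\n", nr_idle_states++);
		if (flags[i] & IDLE_USE_INST_SLEEP)
			printf("state %d: FastSleep (timer stops)\n",
			       nr_idle_states++);
	}
	return 0;
}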
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index f48607cd254024f07501bd802112e90991d697b1..719f6fb5b1c35d00108c47a61918741200eae75d 100644
 #include <linux/cpuidle.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/clockchips.h>
+#include <linux/of.h>
 
 #include <asm/machdep.h>
 #include <asm/firmware.h>
 #include <asm/runlatch.h>
 
+/* Flags and constants used in PowerNV platform */
+
+#define MAX_POWERNV_IDLE_STATES        8
+#define IDLE_USE_INST_NAP      0x00010000 /* Use nap instruction */
+#define IDLE_USE_INST_SLEEP    0x00020000 /* Use sleep instruction */
+
 struct cpuidle_driver powernv_idle_driver = {
        .name             = "powernv_idle",
        .owner            = THIS_MODULE,
@@ -54,10 +62,36 @@ static int nap_loop(struct cpuidle_device *dev,
        return index;
 }
 
+static int fastsleep_loop(struct cpuidle_device *dev,
+                               struct cpuidle_driver *drv,
+                               int index)
+{
+       unsigned long old_lpcr = mfspr(SPRN_LPCR);
+       unsigned long new_lpcr;
+
+       if (unlikely(system_state < SYSTEM_RUNNING))
+               return index;
+
+       new_lpcr = old_lpcr;
+       new_lpcr &= ~(LPCR_MER | LPCR_PECE); /* lpcr[mer] must be 0 */
+
+       /* exit powersave upon external interrupt, but not decrementer
+        * interrupt.
+        */
+       new_lpcr |= LPCR_PECE0;
+
+       mtspr(SPRN_LPCR, new_lpcr);
+       power7_sleep();
+
+       mtspr(SPRN_LPCR, old_lpcr);
+
+       return index;
+}
+
 /*
  * States for dedicated partition case.
  */
-static struct cpuidle_state powernv_states[] = {
+static struct cpuidle_state powernv_states[MAX_POWERNV_IDLE_STATES] = {
        { /* Snooze */
                .name = "snooze",
                .desc = "snooze",
@@ -65,13 +99,6 @@ static struct cpuidle_state powernv_states[] = {
                .exit_latency = 0,
                .target_residency = 0,
                .enter = &snooze_loop },
-       { /* NAP */
-               .name = "NAP",
-               .desc = "NAP",
-               .flags = CPUIDLE_FLAG_TIME_VALID,
-               .exit_latency = 10,
-               .target_residency = 100,
-               .enter = &nap_loop },
 };
 
 static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,
@@ -132,19 +159,74 @@ static int powernv_cpuidle_driver_init(void)
        return 0;
 }
 
+static int powernv_add_idle_states(void)
+{
+       struct device_node *power_mgt;
+       struct property *prop;
+       int nr_idle_states = 1; /* Snooze */
+       int dt_idle_states;
+       u32 *flags;
+       int i;
+
+       /* Currently we have snooze statically defined */
+
+       power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
+       if (!power_mgt) {
+               pr_warn("opal: PowerMgmt Node not found\n");
+               return nr_idle_states;
+       }
+
+       prop = of_find_property(power_mgt, "ibm,cpu-idle-state-flags", NULL);
+       if (!prop) {
+               pr_warn("DT-PowerMgmt: missing ibm,cpu-idle-state-flags\n");
+               return nr_idle_states;
+       }
+
+       dt_idle_states = prop->length / sizeof(u32);
+       flags = (u32 *) prop->value;
+
+       for (i = 0; i < dt_idle_states; i++) {
+
+               if (flags[i] & IDLE_USE_INST_NAP) {
+                       /* Add NAP state */
+                       strcpy(powernv_states[nr_idle_states].name, "Nap");
+                       strcpy(powernv_states[nr_idle_states].desc, "Nap");
+                       powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID;
+                       powernv_states[nr_idle_states].exit_latency = 10;
+                       powernv_states[nr_idle_states].target_residency = 100;
+                       powernv_states[nr_idle_states].enter = &nap_loop;
+                       nr_idle_states++;
+               }
+
+               if (flags[i] & IDLE_USE_INST_SLEEP) {
+                       /* Add FASTSLEEP state */
+                       strcpy(powernv_states[nr_idle_states].name, "FastSleep");
+                       strcpy(powernv_states[nr_idle_states].desc, "FastSleep");
+                       powernv_states[nr_idle_states].flags =
+                               CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TIMER_STOP;
+                       powernv_states[nr_idle_states].exit_latency = 300;
+                       powernv_states[nr_idle_states].target_residency = 1000000;
+                       powernv_states[nr_idle_states].enter = &fastsleep_loop;
+                       nr_idle_states++;
+               }
+       }
+
+       return nr_idle_states;
+}
+
 /*
  * powernv_idle_probe()
  * Choose state table for shared versus dedicated partition
  */
 static int powernv_idle_probe(void)
 {
-
        if (cpuidle_disable != IDLE_NO_OVERRIDE)
                return -ENODEV;
 
        if (firmware_has_feature(FW_FEATURE_OPALv3)) {
                cpuidle_state_table = powernv_states;
-               max_idle_state = ARRAY_SIZE(powernv_states);
+               /* Device tree can indicate more idle states */
+               max_idle_state = powernv_add_idle_states();
        } else
                return -ENODEV;