Merge branch 'linus' into perfcounters/core
author Ingo Molnar <mingo@elte.hu>
Sat, 19 Sep 2009 09:27:32 +0000 (11:27 +0200)
committer Ingo Molnar <mingo@elte.hu>
Sat, 19 Sep 2009 09:28:41 +0000 (11:28 +0200)
Merge reason: Bring in tracing changes we depend on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/kernel/cpu/perf_counter.c
kernel/perf_counter.c
kernel/sched_fair.c

diff --combined arch/x86/kernel/cpu/perf_counter.c
index 6a0e71b38126cc93f84624502d06e9abfca6f222,2732e2c1e4d340257e10a072fd7f4e2dbaa0a0a3..dbdf712fae9ec52f5a75c4a2b50fb6d272ca6f43
@@@ -36,10 -36,10 +36,10 @@@ static u64 perf_counter_mask __read_mos
  #define BTS_RECORD_SIZE               24
  
  /* The size of a per-cpu BTS buffer in bytes: */
 -#define BTS_BUFFER_SIZE               (BTS_RECORD_SIZE * 1024)
 +#define BTS_BUFFER_SIZE               (BTS_RECORD_SIZE * 2048)
  
  /* The BTS overflow threshold in bytes from the end of the buffer: */
 -#define BTS_OVFL_TH           (BTS_RECORD_SIZE * 64)
 +#define BTS_OVFL_TH           (BTS_RECORD_SIZE * 128)
  
  
  /*
@@@ -1211,7 -1211,7 +1211,7 @@@ amd_pmu_disable_counter(struct hw_perf_
        x86_pmu_disable_counter(hwc, idx);
  }
  
- static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
  
  /*
   * Set the next IRQ period, based on the hwc->period_left value.
@@@ -1253,7 -1253,7 +1253,7 @@@ x86_perf_counter_set_period(struct perf
        if (left > x86_pmu.max_period)
                left = x86_pmu.max_period;
  
-       per_cpu(prev_left[idx], smp_processor_id()) = left;
+       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
        /*
         * The hw counter starts counting from this counter offset,
@@@ -1470,7 -1470,7 +1470,7 @@@ void perf_counter_print_debug(void
                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
                rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
  
-               prev_left = per_cpu(prev_left[idx], cpu);
+               prev_left = per_cpu(pmc_prev_left[idx], cpu);
  
                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
        local_irq_restore(flags);
  }
  
 -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 -                                     struct perf_sample_data *data)
 +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
  {
        struct debug_store *ds = cpuc->ds;
        struct bts_record {
                u64     flags;
        };
        struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
 -      unsigned long orig_ip = data->regs->ip;
        struct bts_record *at, *top;
 +      struct perf_output_handle handle;
 +      struct perf_event_header header;
 +      struct perf_sample_data data;
 +      struct pt_regs regs;
  
        if (!counter)
                return;
        at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
        top = (struct bts_record *)(unsigned long)ds->bts_index;
  
 +      if (top <= at)
 +              return;
 +
        ds->bts_index = ds->bts_buffer_base;
  
 +
 +      data.period     = counter->hw.last_period;
 +      data.addr       = 0;
 +      regs.ip         = 0;
 +
 +      /*
 +       * Prepare a generic sample, i.e. fill in the invariant fields.
 +       * We will overwrite the from and to address before we output
 +       * the sample.
 +       */
 +      perf_prepare_sample(&header, &data, counter, &regs);
 +
 +      if (perf_output_begin(&handle, counter,
 +                            header.size * (top - at), 1, 1))
 +              return;
 +
        for (; at < top; at++) {
 -              data->regs->ip  = at->from;
 -              data->addr      = at->to;
 +              data.ip         = at->from;
 +              data.addr       = at->to;
  
 -              perf_counter_output(counter, 1, data);
 +              perf_output_sample(&handle, &header, &data, counter);
        }
  
 -      data->regs->ip  = orig_ip;
 -      data->addr      = 0;
 +      perf_output_end(&handle);
  
        /* There's new data available. */
 +      counter->hw.interrupts++;
        counter->pending_kill = POLL_IN;
  }
  
@@@ -1573,9 -1552,13 +1573,9 @@@ static void x86_pmu_disable(struct perf
        x86_perf_counter_update(counter, hwc, idx);
  
        /* Drain the remaining BTS records. */
 -      if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
 -              struct perf_sample_data data;
 -              struct pt_regs regs;
 +      if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
 +              intel_pmu_drain_bts_buffer(cpuc);
  
 -              data.regs = &regs;
 -              intel_pmu_drain_bts_buffer(cpuc, &data);
 -      }
        cpuc->counters[idx] = NULL;
        clear_bit(idx, cpuc->used_mask);
  
@@@ -1636,6 -1619,7 +1636,6 @@@ static int p6_pmu_handle_irq(struct pt_
        int idx, handled = 0;
        u64 val;
  
 -      data.regs = regs;
        data.addr = 0;
  
        cpuc = &__get_cpu_var(cpu_hw_counters);
                if (!x86_perf_counter_set_period(counter, hwc, idx))
                        continue;
  
 -              if (perf_counter_overflow(counter, 1, &data))
 +              if (perf_counter_overflow(counter, 1, &data, regs))
                        p6_pmu_disable_counter(hwc, idx);
        }
  
@@@ -1681,12 -1665,13 +1681,12 @@@ static int intel_pmu_handle_irq(struct 
        int bit, loops;
        u64 ack, status;
  
 -      data.regs = regs;
        data.addr = 0;
  
        cpuc = &__get_cpu_var(cpu_hw_counters);
  
        perf_disable();
 -      intel_pmu_drain_bts_buffer(cpuc, &data);
 +      intel_pmu_drain_bts_buffer(cpuc);
        status = intel_pmu_get_status();
        if (!status) {
                perf_enable();
@@@ -1717,7 -1702,7 +1717,7 @@@ again
  
                data.period = counter->hw.last_period;
  
 -              if (perf_counter_overflow(counter, 1, &data))
 +              if (perf_counter_overflow(counter, 1, &data, regs))
                        intel_pmu_disable_counter(&counter->hw, bit);
        }
  
@@@ -1744,6 -1729,7 +1744,6 @@@ static int amd_pmu_handle_irq(struct pt
        int idx, handled = 0;
        u64 val;
  
 -      data.regs = regs;
        data.addr = 0;
  
        cpuc = &__get_cpu_var(cpu_hw_counters);
                if (!x86_perf_counter_set_period(counter, hwc, idx))
                        continue;
  
 -              if (perf_counter_overflow(counter, 1, &data))
 +              if (perf_counter_overflow(counter, 1, &data, regs))
                        amd_pmu_disable_counter(hwc, idx);
        }
  
@@@ -2124,8 -2110,8 +2124,8 @@@ void callchain_store(struct perf_callch
                entry->ip[entry->nr++] = ip;
  }
  
- static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
- static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
  static DEFINE_PER_CPU(int, in_nmi_frame);
  
  
@@@ -2278,9 -2264,9 +2278,9 @@@ struct perf_callchain_entry *perf_callc
        struct perf_callchain_entry *entry;
  
        if (in_nmi())
-               entry = &__get_cpu_var(nmi_entry);
+               entry = &__get_cpu_var(pmc_nmi_entry);
        else
-               entry = &__get_cpu_var(irq_entry);
+               entry = &__get_cpu_var(pmc_irq_entry);
  
        entry->nr = 0;
  
diff --combined kernel/perf_counter.c
index 06d233a06da59842a81f51daf24f57c333660d85,8cb94a52d1bb3c21154e8dd028eecc93e1588866..d013f4e89e9cdee913b117cbbc83e4e326889113
@@@ -106,16 -106,16 +106,16 @@@ hw_perf_group_sched_in(struct perf_coun
  
  void __weak perf_counter_print_debug(void)    { }
  
- static DEFINE_PER_CPU(int, disable_count);
+ static DEFINE_PER_CPU(int, perf_disable_count);
  
  void __perf_disable(void)
  {
-       __get_cpu_var(disable_count)++;
+       __get_cpu_var(perf_disable_count)++;
  }
  
  bool __perf_enable(void)
  {
-       return !--__get_cpu_var(disable_count);
+       return !--__get_cpu_var(perf_disable_count);
  }
  
  void perf_disable(void)
@@@ -2176,13 -2176,6 +2176,13 @@@ static int perf_mmap_data_alloc(struct 
        data->nr_pages = nr_pages;
        atomic_set(&data->lock, -1);
  
 +      if (counter->attr.watermark) {
 +              data->watermark = min_t(long, PAGE_SIZE * nr_pages,
 +                                    counter->attr.wakeup_watermark);
 +      }
 +      if (!data->watermark)
 +              data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
 +
        rcu_assign_pointer(counter->data, data);
  
        return 0;
@@@ -2322,8 -2315,7 +2322,8 @@@ static int perf_mmap(struct file *file
        lock_limit >>= PAGE_SHIFT;
        locked = vma->vm_mm->locked_vm + extra;
  
 -      if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
 +      if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
 +              !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }
@@@ -2512,15 -2504,35 +2512,15 @@@ __weak struct perf_callchain_entry *per
  /*
   * Output
   */
 -
 -struct perf_output_handle {
 -      struct perf_counter     *counter;
 -      struct perf_mmap_data   *data;
 -      unsigned long           head;
 -      unsigned long           offset;
 -      int                     nmi;
 -      int                     sample;
 -      int                     locked;
 -      unsigned long           flags;
 -};
 -
 -static bool perf_output_space(struct perf_mmap_data *data,
 -                            unsigned int offset, unsigned int head)
 +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
 +                            unsigned long offset, unsigned long head)
  {
 -      unsigned long tail;
        unsigned long mask;
  
        if (!data->writable)
                return true;
  
        mask = (data->nr_pages << PAGE_SHIFT) - 1;
 -      /*
 -       * Userspace could choose to issue a mb() before updating the tail
 -       * pointer. So that all reads will be completed before the write is
 -       * issued.
 -       */
 -      tail = ACCESS_ONCE(data->user_page->data_tail);
 -      smp_rmb();
  
        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;
@@@ -2621,8 -2633,8 +2621,8 @@@ out
        local_irq_restore(handle->flags);
  }
  
 -static void perf_output_copy(struct perf_output_handle *handle,
 -                           const void *buf, unsigned int len)
 +void perf_output_copy(struct perf_output_handle *handle,
 +                    const void *buf, unsigned int len)
  {
        unsigned int pages_mask;
        unsigned int offset;
        WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
  }
  
 -#define perf_output_put(handle, x) \
 -      perf_output_copy((handle), &(x), sizeof(x))
 -
 -static int perf_output_begin(struct perf_output_handle *handle,
 -                           struct perf_counter *counter, unsigned int size,
 -                           int nmi, int sample)
 +int perf_output_begin(struct perf_output_handle *handle,
 +                    struct perf_counter *counter, unsigned int size,
 +                    int nmi, int sample)
  {
        struct perf_counter *output_counter;
        struct perf_mmap_data *data;
 -      unsigned int offset, head;
 +      unsigned long tail, offset, head;
        int have_lost;
        struct {
                struct perf_event_header header;
        perf_output_lock(handle);
  
        do {
 +              /*
 +               * Userspace could choose to issue a mb() before updating the
 +               * tail pointer. So that all reads will be completed before the
 +               * write is issued.
 +               */
 +              tail = ACCESS_ONCE(data->user_page->data_tail);
 +              smp_rmb();
                offset = head = atomic_long_read(&data->head);
                head += size;
 -              if (unlikely(!perf_output_space(data, offset, head)))
 +              if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
        } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
  
        handle->offset  = offset;
        handle->head    = head;
  
 -      if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
 +      if (head - tail > data->watermark)
                atomic_set(&data->wakeup, 1);
  
        if (have_lost) {
@@@ -2741,7 -2749,7 +2741,7 @@@ out
        return -ENOSPC;
  }
  
 -static void perf_output_end(struct perf_output_handle *handle)
 +void perf_output_end(struct perf_output_handle *handle)
  {
        struct perf_counter *counter = handle->counter;
        struct perf_mmap_data *data = handle->data;
@@@ -2855,176 -2863,156 +2855,176 @@@ static void perf_output_read(struct per
                perf_output_read_one(handle, counter);
  }
  
 -void perf_counter_output(struct perf_counter *counter, int nmi,
 -                              struct perf_sample_data *data)
 +void perf_output_sample(struct perf_output_handle *handle,
 +                      struct perf_event_header *header,
 +                      struct perf_sample_data *data,
 +                      struct perf_counter *counter)
  {
 -      int ret;
 -      u64 sample_type = counter->attr.sample_type;
 -      struct perf_output_handle handle;
 -      struct perf_event_header header;
 -      u64 ip;
 -      struct {
 -              u32 pid, tid;
 -      } tid_entry;
 -      struct perf_callchain_entry *callchain = NULL;
 -      int callchain_size = 0;
 -      u64 time;
 -      struct {
 -              u32 cpu, reserved;
 -      } cpu_entry;
 -
 -      header.type = PERF_EVENT_SAMPLE;
 -      header.size = sizeof(header);
 -
 -      header.misc = 0;
 -      header.misc |= perf_misc_flags(data->regs);
 -
 -      if (sample_type & PERF_SAMPLE_IP) {
 -              ip = perf_instruction_pointer(data->regs);
 -              header.size += sizeof(ip);
 -      }
 +      u64 sample_type = data->type;
  
 -      if (sample_type & PERF_SAMPLE_TID) {
 -              /* namespace issues */
 -              tid_entry.pid = perf_counter_pid(counter, current);
 -              tid_entry.tid = perf_counter_tid(counter, current);
 +      perf_output_put(handle, *header);
  
 -              header.size += sizeof(tid_entry);
 -      }
 +      if (sample_type & PERF_SAMPLE_IP)
 +              perf_output_put(handle, data->ip);
  
 -      if (sample_type & PERF_SAMPLE_TIME) {
 -              /*
 -               * Maybe do better on x86 and provide cpu_clock_nmi()
 -               */
 -              time = sched_clock();
 +      if (sample_type & PERF_SAMPLE_TID)
 +              perf_output_put(handle, data->tid_entry);
  
 -              header.size += sizeof(u64);
 -      }
 +      if (sample_type & PERF_SAMPLE_TIME)
 +              perf_output_put(handle, data->time);
  
        if (sample_type & PERF_SAMPLE_ADDR)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->addr);
  
        if (sample_type & PERF_SAMPLE_ID)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->id);
  
        if (sample_type & PERF_SAMPLE_STREAM_ID)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->stream_id);
  
 -      if (sample_type & PERF_SAMPLE_CPU) {
 -              header.size += sizeof(cpu_entry);
 -
 -              cpu_entry.cpu = raw_smp_processor_id();
 -              cpu_entry.reserved = 0;
 -      }
 +      if (sample_type & PERF_SAMPLE_CPU)
 +              perf_output_put(handle, data->cpu_entry);
  
        if (sample_type & PERF_SAMPLE_PERIOD)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->period);
  
        if (sample_type & PERF_SAMPLE_READ)
 -              header.size += perf_counter_read_size(counter);
 +              perf_output_read(handle, counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 -              callchain = perf_callchain(data->regs);
 +              if (data->callchain) {
 +                      int size = 1;
  
 -              if (callchain) {
 -                      callchain_size = (1 + callchain->nr) * sizeof(u64);
 -                      header.size += callchain_size;
 -              } else
 -                      header.size += sizeof(u64);
 +                      if (data->callchain)
 +                              size += data->callchain->nr;
 +
 +                      size *= sizeof(u64);
 +
 +                      perf_output_copy(handle, data->callchain, size);
 +              } else {
 +                      u64 nr = 0;
 +                      perf_output_put(handle, nr);
 +              }
        }
  
        if (sample_type & PERF_SAMPLE_RAW) {
 -              int size = sizeof(u32);
 +              if (data->raw) {
 +                      perf_output_put(handle, data->raw->size);
 +                      perf_output_copy(handle, data->raw->data,
 +                                       data->raw->size);
 +              } else {
 +                      struct {
 +                              u32     size;
 +                              u32     data;
 +                      } raw = {
 +                              .size = sizeof(u32),
 +                              .data = 0,
 +                      };
 +                      perf_output_put(handle, raw);
 +              }
 +      }
 +}
  
 -              if (data->raw)
 -                      size += data->raw->size;
 -              else
 -                      size += sizeof(u32);
 +void perf_prepare_sample(struct perf_event_header *header,
 +                       struct perf_sample_data *data,
 +                       struct perf_counter *counter,
 +                       struct pt_regs *regs)
 +{
 +      u64 sample_type = counter->attr.sample_type;
  
 -              WARN_ON_ONCE(size & (sizeof(u64)-1));
 -              header.size += size;
 -      }
 +      data->type = sample_type;
  
 -      ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 -      if (ret)
 -              return;
 +      header->type = PERF_EVENT_SAMPLE;
 +      header->size = sizeof(*header);
  
 -      perf_output_put(&handle, header);
 +      header->misc = 0;
 +      header->misc |= perf_misc_flags(regs);
  
 -      if (sample_type & PERF_SAMPLE_IP)
 -              perf_output_put(&handle, ip);
 +      if (sample_type & PERF_SAMPLE_IP) {
 +              data->ip = perf_instruction_pointer(regs);
  
 -      if (sample_type & PERF_SAMPLE_TID)
 -              perf_output_put(&handle, tid_entry);
 +              header->size += sizeof(data->ip);
 +      }
  
 -      if (sample_type & PERF_SAMPLE_TIME)
 -              perf_output_put(&handle, time);
 +      if (sample_type & PERF_SAMPLE_TID) {
 +              /* namespace issues */
 +              data->tid_entry.pid = perf_counter_pid(counter, current);
 +              data->tid_entry.tid = perf_counter_tid(counter, current);
 +
 +              header->size += sizeof(data->tid_entry);
 +      }
 +
 +      if (sample_type & PERF_SAMPLE_TIME) {
 +              data->time = perf_clock();
 +
 +              header->size += sizeof(data->time);
 +      }
  
        if (sample_type & PERF_SAMPLE_ADDR)
 -              perf_output_put(&handle, data->addr);
 +              header->size += sizeof(data->addr);
  
        if (sample_type & PERF_SAMPLE_ID) {
 -              u64 id = primary_counter_id(counter);
 +              data->id = primary_counter_id(counter);
  
 -              perf_output_put(&handle, id);
 +              header->size += sizeof(data->id);
        }
  
 -      if (sample_type & PERF_SAMPLE_STREAM_ID)
 -              perf_output_put(&handle, counter->id);
 +      if (sample_type & PERF_SAMPLE_STREAM_ID) {
 +              data->stream_id = counter->id;
  
 -      if (sample_type & PERF_SAMPLE_CPU)
 -              perf_output_put(&handle, cpu_entry);
 +              header->size += sizeof(data->stream_id);
 +      }
 +
 +      if (sample_type & PERF_SAMPLE_CPU) {
 +              data->cpu_entry.cpu             = raw_smp_processor_id();
 +              data->cpu_entry.reserved        = 0;
 +
 +              header->size += sizeof(data->cpu_entry);
 +      }
  
        if (sample_type & PERF_SAMPLE_PERIOD)
 -              perf_output_put(&handle, data->period);
 +              header->size += sizeof(data->period);
  
        if (sample_type & PERF_SAMPLE_READ)
 -              perf_output_read(&handle, counter);
 +              header->size += perf_counter_read_size(counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 -              if (callchain)
 -                      perf_output_copy(&handle, callchain, callchain_size);
 -              else {
 -                      u64 nr = 0;
 -                      perf_output_put(&handle, nr);
 -              }
 +              int size = 1;
 +
 +              data->callchain = perf_callchain(regs);
 +
 +              if (data->callchain)
 +                      size += data->callchain->nr;
 +
 +              header->size += size * sizeof(u64);
        }
  
        if (sample_type & PERF_SAMPLE_RAW) {
 -              if (data->raw) {
 -                      perf_output_put(&handle, data->raw->size);
 -                      perf_output_copy(&handle, data->raw->data, data->raw->size);
 -              } else {
 -                      struct {
 -                              u32     size;
 -                              u32     data;
 -                      } raw = {
 -                              .size = sizeof(u32),
 -                              .data = 0,
 -                      };
 -                      perf_output_put(&handle, raw);
 -              }
 +              int size = sizeof(u32);
 +
 +              if (data->raw)
 +                      size += data->raw->size;
 +              else
 +                      size += sizeof(u32);
 +
 +              WARN_ON_ONCE(size & (sizeof(u64)-1));
 +              header->size += size;
        }
 +}
 +
 +static void perf_counter_output(struct perf_counter *counter, int nmi,
 +                              struct perf_sample_data *data,
 +                              struct pt_regs *regs)
 +{
 +      struct perf_output_handle handle;
 +      struct perf_event_header header;
 +
 +      perf_prepare_sample(&header, data, counter, regs);
 +
 +      if (perf_output_begin(&handle, counter, header.size, nmi, 1))
 +              return;
 +
 +      perf_output_sample(&handle, &header, data, counter);
  
        perf_output_end(&handle);
  }
@@@ -3485,7 -3473,7 +3485,7 @@@ static void perf_log_throttle(struct pe
                        .misc = 0,
                        .size = sizeof(throttle_event),
                },
 -              .time           = sched_clock(),
 +              .time           = perf_clock(),
                .id             = primary_counter_id(counter),
                .stream_id      = counter->id,
        };
   * Generic counter overflow handling, sampling.
   */
  
 -int perf_counter_overflow(struct perf_counter *counter, int nmi,
 -                        struct perf_sample_data *data)
 +static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
 +                                 int throttle, struct perf_sample_data *data,
 +                                 struct pt_regs *regs)
  {
        int events = atomic_read(&counter->event_limit);
 -      int throttle = counter->pmu->unthrottle != NULL;
        struct hw_perf_counter *hwc = &counter->hw;
        int ret = 0;
  
 +      throttle = (throttle && counter->pmu->unthrottle != NULL);
 +
        if (!throttle) {
                hwc->interrupts++;
        } else {
        }
  
        if (counter->attr.freq) {
 -              u64 now = sched_clock();
 +              u64 now = perf_clock();
                s64 delta = now - hwc->freq_stamp;
  
                hwc->freq_stamp = now;
                        perf_counter_disable(counter);
        }
  
 -      perf_counter_output(counter, nmi, data);
 +      perf_counter_output(counter, nmi, data, regs);
        return ret;
  }
  
 +int perf_counter_overflow(struct perf_counter *counter, int nmi,
 +                        struct perf_sample_data *data,
 +                        struct pt_regs *regs)
 +{
 +      return __perf_counter_overflow(counter, nmi, 1, data, regs);
 +}
 +
  /*
   * Generic software counter infrastructure
   */
@@@ -3609,11 -3588,9 +3609,11 @@@ again
  }
  
  static void perf_swcounter_overflow(struct perf_counter *counter,
 -                                  int nmi, struct perf_sample_data *data)
 +                                  int nmi, struct perf_sample_data *data,
 +                                  struct pt_regs *regs)
  {
        struct hw_perf_counter *hwc = &counter->hw;
 +      int throttle = 0;
        u64 overflow;
  
        data->period = counter->hw.last_period;
                return;
  
        for (; overflow; overflow--) {
 -              if (perf_counter_overflow(counter, nmi, data)) {
 +              if (__perf_counter_overflow(counter, nmi, throttle,
 +                                          data, regs)) {
                        /*
                         * We inhibit the overflow from happening when
                         * hwc->interrupts == MAX_INTERRUPTS.
                         */
                        break;
                }
 +              throttle = 1;
        }
  }
  
@@@ -3643,8 -3618,7 +3643,8 @@@ static void perf_swcounter_unthrottle(s
  }
  
  static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 -                             int nmi, struct perf_sample_data *data)
 +                             int nmi, struct perf_sample_data *data,
 +                             struct pt_regs *regs)
  {
        struct hw_perf_counter *hwc = &counter->hw;
  
        if (!hwc->sample_period)
                return;
  
 -      if (!data->regs)
 +      if (!regs)
                return;
  
        if (!atomic64_add_negative(nr, &hwc->period_left))
 -              perf_swcounter_overflow(counter, nmi, data);
 +              perf_swcounter_overflow(counter, nmi, data, regs);
  }
  
  static int perf_swcounter_is_counting(struct perf_counter *counter)
@@@ -3716,8 -3690,7 +3716,8 @@@ static int perf_swcounter_match(struct 
  static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
                                     enum perf_type_id type,
                                     u32 event, u64 nr, int nmi,
 -                                   struct perf_sample_data *data)
 +                                   struct perf_sample_data *data,
 +                                   struct pt_regs *regs)
  {
        struct perf_counter *counter;
  
  
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
 -              if (perf_swcounter_match(counter, type, event, data->regs))
 -                      perf_swcounter_add(counter, nr, nmi, data);
 +              if (perf_swcounter_match(counter, type, event, regs))
 +                      perf_swcounter_add(counter, nr, nmi, data, regs);
        }
        rcu_read_unlock();
  }
@@@ -3748,8 -3721,7 +3748,8 @@@ static int *perf_swcounter_recursion_co
  
  static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
                                    u64 nr, int nmi,
 -                                  struct perf_sample_data *data)
 +                                  struct perf_sample_data *data,
 +                                  struct pt_regs *regs)
  {
        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
        int *recursion = perf_swcounter_recursion_context(cpuctx);
        barrier();
  
        perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
 -                               nr, nmi, data);
 +                               nr, nmi, data, regs);
        rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
         */
        ctx = rcu_dereference(current->perf_counter_ctxp);
        if (ctx)
 -              perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
 +              perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
        rcu_read_unlock();
  
        barrier();
@@@ -3784,11 -3756,11 +3784,11 @@@ void __perf_swcounter_event(u32 event, 
                            struct pt_regs *regs, u64 addr)
  {
        struct perf_sample_data data = {
 -              .regs = regs,
                .addr = addr,
        };
  
 -      do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
 +      do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
 +                              &data, regs);
  }
  
  static void perf_swcounter_read(struct perf_counter *counter)
@@@ -3825,7 -3797,6 +3825,7 @@@ static enum hrtimer_restart perf_swcoun
  {
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
 +      struct pt_regs *regs;
        struct perf_counter *counter;
        u64 period;
  
        counter->pmu->read(counter);
  
        data.addr = 0;
 -      data.regs = get_irq_regs();
 +      regs = get_irq_regs();
        /*
         * In case we exclude kernel IPs or are somehow not in interrupt
         * context, provide the next best thing, the user IP.
         */
 -      if ((counter->attr.exclude_kernel || !data.regs) &&
 +      if ((counter->attr.exclude_kernel || !regs) &&
                        !counter->attr.exclude_user)
 -              data.regs = task_pt_regs(current);
 +              regs = task_pt_regs(current);
  
 -      if (data.regs) {
 -              if (perf_counter_overflow(counter, 0, &data))
 +      if (regs) {
 +              if (perf_counter_overflow(counter, 0, &data, regs))
                        ret = HRTIMER_NORESTART;
        }
  
@@@ -3979,17 -3950,15 +3979,17 @@@ void perf_tpcounter_event(int event_id
        };
  
        struct perf_sample_data data = {
 -              .regs = get_irq_regs(),
                .addr = addr,
                .raw = &raw,
        };
  
 -      if (!data.regs)
 -              data.regs = task_pt_regs(current);
 +      struct pt_regs *regs = get_irq_regs();
 +
 +      if (!regs)
 +              regs = task_pt_regs(current);
  
 -      do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
 +      do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 +                              &data, regs);
  }
  EXPORT_SYMBOL_GPL(perf_tpcounter_event);
  
@@@ -4246,6 -4215,7 +4246,7 @@@ static int perf_copy_attr(struct perf_c
                        if (val)
                                goto err_size;
                }
+               size = sizeof(*attr);
        }
  
        ret = copy_from_user(attr, uattr, size);
diff --combined kernel/sched_fair.c
index a097e909e80f1d38ec976b857f1b227efa920be7,10d218ab69f2ba4eac39a2d07461c0b2b256de34..990b188803ced00f8b3554bb1fe570b14b716c69
@@@ -513,7 -513,6 +513,7 @@@ static void update_curr(struct cfs_rq *
        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);
  
 +              trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
                cpuacct_charge(curtask, delta_exec);
                account_group_exec_runtime(curtask, delta_exec);
        }
@@@ -712,7 -711,7 +712,7 @@@ place_entity(struct cfs_rq *cfs_rq, str
  
        if (!initial) {
                /* sleeps upto a single latency don't count. */
-               if (sched_feat(NEW_FAIR_SLEEPERS)) {
+               if (sched_feat(FAIR_SLEEPERS)) {
                        unsigned long thresh = sysctl_sched_latency;
  
                        /*
                                         task_of(se)->policy != SCHED_IDLE))
                                thresh = calc_delta_fair(thresh, se);
  
+                       /*
+                        * Halve their sleep time's effect, to allow
+                        * for a gentler effect of sleepers:
+                        */
+                       if (sched_feat(GENTLE_FAIR_SLEEPERS))
+                               thresh >>= 1;
                        vruntime -= thresh;
                }
        }
@@@ -758,10 -764,10 +765,10 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
  
  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (cfs_rq->last == se)
+       if (!se || cfs_rq->last == se)
                cfs_rq->last = NULL;
  
-       if (cfs_rq->next == se)
+       if (!se || cfs_rq->next == se)
                cfs_rq->next = NULL;
  }
  
@@@ -1063,83 -1069,6 +1070,6 @@@ static void yield_task_fair(struct rq *
        se->vruntime = rightmost->vruntime + 1;
  }
  
- /*
-  * wake_idle() will wake a task on an idle cpu if task->cpu is
-  * not idle and an idle cpu is available.  The span of cpus to
-  * search starts with cpus closest then further out as needed,
-  * so we always favor a closer, idle cpu.
-  * Domains may include CPUs that are not usable for migration,
-  * hence we need to mask them out (rq->rd->online)
-  *
-  * Returns the CPU we should wake onto.
-  */
- #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
- #define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
- static int wake_idle(int cpu, struct task_struct *p)
- {
-       struct sched_domain *sd;
-       int i;
-       unsigned int chosen_wakeup_cpu;
-       int this_cpu;
-       struct rq *task_rq = task_rq(p);
-       /*
-        * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-        * are idle and this is not a kernel thread and this task's affinity
-        * allows it to be moved to preferred cpu, then just move!
-        */
-       this_cpu = smp_processor_id();
-       chosen_wakeup_cpu =
-               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-               idle_cpu(cpu) && idle_cpu(this_cpu) &&
-               p->mm && !(p->flags & PF_KTHREAD) &&
-               cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-               return chosen_wakeup_cpu;
-       /*
-        * If it is idle, then it is the best cpu to run this task.
-        *
-        * This cpu is also the best, if it has more than one task already.
-        * Siblings must be also busy(in most cases) as they didn't already
-        * pickup the extra load from this cpu and hence we need not check
-        * sibling runqueue info. This will avoid the checks and cache miss
-        * penalities associated with that.
-        */
-       if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-               return cpu;
-       for_each_domain(cpu, sd) {
-               if ((sd->flags & SD_WAKE_IDLE)
-                   || ((sd->flags & SD_WAKE_IDLE_FAR)
-                       && !task_hot(p, task_rq->clock, sd))) {
-                       for_each_cpu_and(i, sched_domain_span(sd),
-                                        &p->cpus_allowed) {
-                               if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-                                       if (i != task_cpu(p)) {
-                                               schedstat_inc(p,
-                                                      se.nr_wakeups_idle);
-                                       }
-                                       return i;
-                               }
-                       }
-               } else {
-                       break;
-               }
-       }
-       return cpu;
- }
- #else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
- static inline int wake_idle(int cpu, struct task_struct *p)
- {
-       return cpu;
- }
- #endif
  #ifdef CONFIG_SMP
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -1226,25 -1155,34 +1156,34 @@@ static inline unsigned long effective_l
  
  #endif
  
- static int
- wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-           struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-           int idx, unsigned long load, unsigned long this_load,
-           unsigned int imbalance)
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  {
-       struct task_struct *curr = this_rq->curr;
-       struct task_group *tg;
-       unsigned long tl = this_load;
+       struct task_struct *curr = current;
+       unsigned long this_load, load;
+       int idx, this_cpu, prev_cpu;
        unsigned long tl_per_task;
+       unsigned int imbalance;
+       struct task_group *tg;
        unsigned long weight;
        int balanced;
  
-       if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-               return 0;
+       idx       = sd->wake_idx;
+       this_cpu  = smp_processor_id();
+       prev_cpu  = task_cpu(p);
+       load      = source_load(prev_cpu, idx);
+       this_load = target_load(this_cpu, idx);
  
-       if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-                       p->se.avg_overlap > sysctl_sched_migration_cost))
-               sync = 0;
+       if (sync) {
+              if (sched_feat(SYNC_LESS) &&
+                  (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+                   p->se.avg_overlap > sysctl_sched_migration_cost))
+                      sync = 0;
+       } else {
+               if (sched_feat(SYNC_MORE) &&
+                   (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+                    p->se.avg_overlap < sysctl_sched_migration_cost))
+                       sync = 1;
+       }
  
        /*
         * If sync wakeup then subtract the (maximum possible)
                tg = task_group(current);
                weight = current->se.load.weight;
  
-               tl += effective_load(tg, this_cpu, -weight, -weight);
+               this_load += effective_load(tg, this_cpu, -weight, -weight);
                load += effective_load(tg, prev_cpu, 0, -weight);
        }
  
        tg = task_group(p);
        weight = p->se.load.weight;
  
+       imbalance = 100 + (sd->imbalance_pct - 100) / 2;
        /*
         * In low-load situations, where prev_cpu is idle and this_cpu is idle
-        * due to the sync cause above having dropped tl to 0, we'll always have
-        * an imbalance, but there's really nothing you can do about that, so
-        * that's good too.
+        * due to the sync cause above having dropped this_load to 0, we'll
+        * always have an imbalance, but there's really nothing you can do
+        * about that, so that's good too.
         *
         * Otherwise check if either cpus are near enough in load to allow this
         * task to be woken on this_cpu.
         */
-       balanced = !tl ||
-               100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+       balanced = !this_load ||
+               100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
                imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
  
        /*
        schedstat_inc(p, se.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
  
-       if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-                       tl_per_task)) {
+       if (balanced ||
+           (this_load <= load &&
+            this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
                /*
                 * This domain has SD_WAKE_AFFINE and
                 * p is cache cold in this domain, and
                 * there is no bad imbalance.
                 */
-               schedstat_inc(this_sd, ttwu_move_affine);
+               schedstat_inc(sd, ttwu_move_affine);
                schedstat_inc(p, se.nr_wakeups_affine);
  
                return 1;
        return 0;
  }
  
- static int select_task_rq_fair(struct task_struct *p, int sync)
+ /*
+  * find_idlest_group finds and returns the least busy CPU group within the
+  * domain.
+  */
+ static struct sched_group *
+ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int load_idx)
  {
-       struct sched_domain *sd, *this_sd = NULL;
-       int prev_cpu, this_cpu, new_cpu;
-       unsigned long load, this_load;
-       struct rq *this_rq;
-       unsigned int imbalance;
-       int idx;
+       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+       unsigned long min_load = ULONG_MAX, this_load = 0;
+       int imbalance = 100 + (sd->imbalance_pct-100)/2;
  
-       prev_cpu        = task_cpu(p);
-       this_cpu        = smp_processor_id();
-       this_rq         = cpu_rq(this_cpu);
-       new_cpu         = prev_cpu;
+       do {
+               unsigned long load, avg_load;
+               int local_group;
+               int i;
  
-       /*
-        * 'this_sd' is the first domain that both
-        * this_cpu and prev_cpu are present in:
-        */
-       for_each_domain(this_cpu, sd) {
-               if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-                       this_sd = sd;
-                       break;
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
+                       continue;
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
+               /* Tally up the load of all CPUs in the group */
+               avg_load = 0;
+               for_each_cpu(i, sched_group_cpus(group)) {
+                       /* Bias balancing toward cpus of our domain */
+                       if (local_group)
+                               load = source_load(i, load_idx);
+                       else
+                               load = target_load(i, load_idx);
+                       avg_load += load;
+               }
+               /* Adjust by relative CPU power of the group */
+               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               if (local_group) {
+                       this_load = avg_load;
+                       this = group;
+               } else if (avg_load < min_load) {
+                       min_load = avg_load;
+                       idlest = group;
+               }
+       } while (group = group->next, group != sd->groups);
+       if (!idlest || 100*this_load < imbalance*min_load)
+               return NULL;
+       return idlest;
+ }
+ /*
+  * find_idlest_cpu - find the idlest cpu among the cpus in group.
+  */
+ static int
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+ {
+       unsigned long load, min_load = ULONG_MAX;
+       int idlest = -1;
+       int i;
+       /* Traverse only the allowed CPUs */
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+               load = weighted_cpuload(i);
+               if (load < min_load || (load == min_load && i == this_cpu)) {
+                       min_load = load;
+                       idlest = i;
                }
        }
  
-       if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-               goto out;
+       return idlest;
+ }
  
-       /*
-        * Check for affine wakeup and passive balancing possibilities.
-        */
-       if (!this_sd)
+ /*
+  * sched_balance_self: balance the current task (running on cpu) in domains
+  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+  * SD_BALANCE_EXEC.
+  *
+  * Balance, ie. select the least loaded group.
+  *
+  * Returns the target CPU number, or the same CPU if no balancing is needed.
+  *
+  * preempt must be disabled.
+  */
+ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+ {
+       struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+       int cpu = smp_processor_id();
+       int prev_cpu = task_cpu(p);
+       int new_cpu = cpu;
+       int want_affine = 0;
+       int want_sd = 1;
+       int sync = wake_flags & WF_SYNC;
+       if (sd_flag & SD_BALANCE_WAKE) {
+               if (sched_feat(AFFINE_WAKEUPS))
+                       want_affine = 1;
+               new_cpu = prev_cpu;
+       }
+       rcu_read_lock();
+       for_each_domain(cpu, tmp) {
+               /*
+                * If power savings logic is enabled for a domain, see if we
+                * are not overloaded, if so, don't balance wider.
+                */
+               if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+                       unsigned long power = 0;
+                       unsigned long nr_running = 0;
+                       unsigned long capacity;
+                       int i;
+                       for_each_cpu(i, sched_domain_span(tmp)) {
+                               power += power_of(i);
+                               nr_running += cpu_rq(i)->cfs.nr_running;
+                       }
+                       capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+                       if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+                               nr_running /= 2;
+                       if (nr_running < capacity)
+                               want_sd = 0;
+               }
+               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                       affine_sd = tmp;
+                       want_affine = 0;
+               }
+               if (!want_sd && !want_affine)
+                       break;
+               if (!(tmp->flags & sd_flag))
+                       continue;
+               if (want_sd)
+                       sd = tmp;
+       }
+       if (sched_feat(LB_SHARES_UPDATE)) {
+               /*
+                * Pick the largest domain to update shares over
+                */
+               tmp = sd;
+               if (affine_sd && (!tmp ||
+                                 cpumask_weight(sched_domain_span(affine_sd)) >
+                                 cpumask_weight(sched_domain_span(sd))))
+                       tmp = affine_sd;
+               if (tmp)
+                       update_shares(tmp);
+       }
+       if (affine_sd && wake_affine(affine_sd, p, sync)) {
+               new_cpu = cpu;
                goto out;
+       }
  
-       idx = this_sd->wake_idx;
+       while (sd) {
+               int load_idx = sd->forkexec_idx;
+               struct sched_group *group;
+               int weight;
  
-       imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+               if (!(sd->flags & sd_flag)) {
+                       sd = sd->child;
+                       continue;
+               }
  
-       load = source_load(prev_cpu, idx);
-       this_load = target_load(this_cpu, idx);
+               if (sd_flag & SD_BALANCE_WAKE)
+                       load_idx = sd->wake_idx;
  
-       if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-                                    load, this_load, imbalance))
-               return this_cpu;
+               group = find_idlest_group(sd, p, cpu, load_idx);
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
  
-       /*
-        * Start passive balancing when half the imbalance_pct
-        * limit is reached.
-        */
-       if (this_sd->flags & SD_WAKE_BALANCE) {
-               if (imbalance*this_load <= 100*load) {
-                       schedstat_inc(this_sd, ttwu_move_balance);
-                       schedstat_inc(p, se.nr_wakeups_passive);
-                       return this_cpu;
+               new_cpu = find_idlest_cpu(group, p, cpu);
+               if (new_cpu == -1 || new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
                }
+               /* Now try balancing at a lower domain level of new_cpu */
+               cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
+               sd = NULL;
+               for_each_domain(cpu, tmp) {
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
+                               break;
+                       if (tmp->flags & sd_flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
        }
  
  out:
-       return wake_idle(new_cpu, p);
+       rcu_read_unlock();
+       return new_cpu;
  }
  #endif /* CONFIG_SMP */
  
@@@ -1472,11 -1563,12 +1564,12 @@@ static void set_next_buddy(struct sched
  /*
   * Preempt the current task with a newly woken task if needed:
   */
- static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  {
        struct task_struct *curr = rq->curr;
        struct sched_entity *se = &curr->se, *pse = &p->se;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+       int sync = wake_flags & WF_SYNC;
  
        update_curr(cfs_rq);
  
         */
        if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
                set_last_buddy(se);
-       set_next_buddy(pse);
+       if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+               set_next_buddy(pse);
  
        /*
         * We can come here with TIF_NEED_RESCHED already set from new task
                return;
        }
  
-       if (!sched_feat(WAKEUP_PREEMPT))
-               return;
-       if (sched_feat(WAKEUP_OVERLAP) && (sync ||
-                       (se->avg_overlap < sysctl_sched_migration_cost &&
-                        pse->avg_overlap < sysctl_sched_migration_cost))) {
+       if ((sched_feat(WAKEUP_SYNC) && sync) ||
+           (sched_feat(WAKEUP_OVERLAP) &&
+            (se->avg_overlap < sysctl_sched_migration_cost &&
+             pse->avg_overlap < sysctl_sched_migration_cost))) {
                resched_task(curr);
                return;
        }
  
+       if (sched_feat(WAKEUP_RUNNING)) {
+               if (pse->avg_running < se->avg_running) {
+                       set_next_buddy(pse);
+                       resched_task(curr);
+                       return;
+               }
+       }
+       if (!sched_feat(WAKEUP_PREEMPT))
+               return;
        find_matching_se(&se, &pse);
  
        BUG_ON(!pse);
@@@ -1556,8 -1658,13 +1659,13 @@@ static struct task_struct *pick_next_ta
                /*
                 * If se was a buddy, clear it so that it will have to earn
                 * the favour again.
+                *
+                * If se was not a buddy, clear the buddies because neither
 +                * was eligible to run, let them earn it again.
+                *
+                * IOW. unconditionally clear buddies.
                 */
-               __clear_buddies(cfs_rq, se);
+               __clear_buddies(cfs_rq, NULL);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);