Merge branch 'linus' into perfcounters/core
author Ingo Molnar <mingo@elte.hu>
Sat, 19 Sep 2009 09:27:32 +0000 (11:27 +0200)
committer Ingo Molnar <mingo@elte.hu>
Sat, 19 Sep 2009 09:28:41 +0000 (11:28 +0200)
Merge reason: Bring in tracing changes we depend on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/kernel/cpu/perf_counter.c
kernel/perf_counter.c
kernel/sched_fair.c

diff --combined arch/x86/kernel/cpu/perf_counter.c
index 6a0e71b38126cc93f84624502d06e9abfca6f222,2732e2c1e4d340257e10a072fd7f4e2dbaa0a0a3..dbdf712fae9ec52f5a75c4a2b50fb6d272ca6f43
@@@ -36,10 -36,10 +36,10 @@@ static u64 perf_counter_mask __read_mos
  #define BTS_RECORD_SIZE               24
  
  /* The size of a per-cpu BTS buffer in bytes: */
 -#define BTS_BUFFER_SIZE               (BTS_RECORD_SIZE * 1024)
 +#define BTS_BUFFER_SIZE               (BTS_RECORD_SIZE * 2048)
  
  /* The BTS overflow threshold in bytes from the end of the buffer: */
 -#define BTS_OVFL_TH           (BTS_RECORD_SIZE * 64)
 +#define BTS_OVFL_TH           (BTS_RECORD_SIZE * 128)
  
  
  /*
@@@ -1211,7 -1211,7 +1211,7 @@@ amd_pmu_disable_counter(struct hw_perf_
        x86_pmu_disable_counter(hwc, idx);
  }
  
- static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
  
  /*
   * Set the next IRQ period, based on the hwc->period_left value.
@@@ -1253,7 -1253,7 +1253,7 @@@ x86_perf_counter_set_period(struct perf
        if (left > x86_pmu.max_period)
                left = x86_pmu.max_period;
  
-       per_cpu(prev_left[idx], smp_processor_id()) = left;
+       per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
        /*
         * The hw counter starts counting from this counter offset,
@@@ -1470,7 -1470,7 +1470,7 @@@ void perf_counter_print_debug(void
                rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
                rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
  
-               prev_left = per_cpu(prev_left[idx], cpu);
+               prev_left = per_cpu(pmc_prev_left[idx], cpu);
  
                pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
        local_irq_restore(flags);
  }
  
 -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 -                                     struct perf_sample_data *data)
 +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
  {
        struct debug_store *ds = cpuc->ds;
        struct bts_record {
                u64     flags;
        };
        struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
 -      unsigned long orig_ip = data->regs->ip;
        struct bts_record *at, *top;
 +      struct perf_output_handle handle;
 +      struct perf_event_header header;
 +      struct perf_sample_data data;
 +      struct pt_regs regs;
  
        if (!counter)
                return;
        at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
        top = (struct bts_record *)(unsigned long)ds->bts_index;
  
 +      if (top <= at)
 +              return;
 +
        ds->bts_index = ds->bts_buffer_base;
  
 +
 +      data.period     = counter->hw.last_period;
 +      data.addr       = 0;
 +      regs.ip         = 0;
 +
 +      /*
 +       * Prepare a generic sample, i.e. fill in the invariant fields.
 +       * We will overwrite the from and to address before we output
 +       * the sample.
 +       */
 +      perf_prepare_sample(&header, &data, counter, &regs);
 +
 +      if (perf_output_begin(&handle, counter,
 +                            header.size * (top - at), 1, 1))
 +              return;
 +
        for (; at < top; at++) {
 -              data->regs->ip  = at->from;
 -              data->addr      = at->to;
 +              data.ip         = at->from;
 +              data.addr       = at->to;
  
 -              perf_counter_output(counter, 1, data);
 +              perf_output_sample(&handle, &header, &data, counter);
        }
  
 -      data->regs->ip  = orig_ip;
 -      data->addr      = 0;
 +      perf_output_end(&handle);
  
        /* There's new data available. */
 +      counter->hw.interrupts++;
        counter->pending_kill = POLL_IN;
  }
  
@@@ -1573,9 -1552,13 +1573,9 @@@ static void x86_pmu_disable(struct perf
        x86_perf_counter_update(counter, hwc, idx);
  
        /* Drain the remaining BTS records. */
 -      if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
 -              struct perf_sample_data data;
 -              struct pt_regs regs;
 +      if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
 +              intel_pmu_drain_bts_buffer(cpuc);
  
 -              data.regs = &regs;
 -              intel_pmu_drain_bts_buffer(cpuc, &data);
 -      }
        cpuc->counters[idx] = NULL;
        clear_bit(idx, cpuc->used_mask);
  
@@@ -1636,6 -1619,7 +1636,6 @@@ static int p6_pmu_handle_irq(struct pt_
        int idx, handled = 0;
        u64 val;
  
 -      data.regs = regs;
        data.addr = 0;
  
        cpuc = &__get_cpu_var(cpu_hw_counters);
                if (!x86_perf_counter_set_period(counter, hwc, idx))
                        continue;
  
 -              if (perf_counter_overflow(counter, 1, &data))
 +              if (perf_counter_overflow(counter, 1, &data, regs))
                        p6_pmu_disable_counter(hwc, idx);
        }
  
@@@ -1681,12 -1665,13 +1681,12 @@@ static int intel_pmu_handle_irq(struct 
        int bit, loops;
        u64 ack, status;
  
 -      data.regs = regs;
        data.addr = 0;
  
        cpuc = &__get_cpu_var(cpu_hw_counters);
  
        perf_disable();
 -      intel_pmu_drain_bts_buffer(cpuc, &data);
 +      intel_pmu_drain_bts_buffer(cpuc);
        status = intel_pmu_get_status();
        if (!status) {
                perf_enable();
@@@ -1717,7 -1702,7 +1717,7 @@@ again
  
                data.period = counter->hw.last_period;
  
 -              if (perf_counter_overflow(counter, 1, &data))
 +              if (perf_counter_overflow(counter, 1, &data, regs))
                        intel_pmu_disable_counter(&counter->hw, bit);
        }
  
@@@ -1744,6 -1729,7 +1744,6 @@@ static int amd_pmu_handle_irq(struct pt
        int idx, handled = 0;
        u64 val;
  
 -      data.regs = regs;
        data.addr = 0;
  
        cpuc = &__get_cpu_var(cpu_hw_counters);
                if (!x86_perf_counter_set_period(counter, hwc, idx))
                        continue;
  
 -              if (perf_counter_overflow(counter, 1, &data))
 +              if (perf_counter_overflow(counter, 1, &data, regs))
                        amd_pmu_disable_counter(hwc, idx);
        }
  
@@@ -2124,8 -2110,8 +2124,8 @@@ void callchain_store(struct perf_callch
                entry->ip[entry->nr++] = ip;
  }
  
- static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
- static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
  static DEFINE_PER_CPU(int, in_nmi_frame);
  
  
@@@ -2278,9 -2264,9 +2278,9 @@@ struct perf_callchain_entry *perf_callc
        struct perf_callchain_entry *entry;
  
        if (in_nmi())
-               entry = &__get_cpu_var(nmi_entry);
+               entry = &__get_cpu_var(pmc_nmi_entry);
        else
-               entry = &__get_cpu_var(irq_entry);
+               entry = &__get_cpu_var(pmc_irq_entry);
  
        entry->nr = 0;
  
diff --combined kernel/perf_counter.c
index 06d233a06da59842a81f51daf24f57c333660d85,8cb94a52d1bb3c21154e8dd028eecc93e1588866..d013f4e89e9cdee913b117cbbc83e4e326889113
@@@ -106,16 -106,16 +106,16 @@@ hw_perf_group_sched_in(struct perf_coun
  
  void __weak perf_counter_print_debug(void)    { }
  
- static DEFINE_PER_CPU(int, disable_count);
+ static DEFINE_PER_CPU(int, perf_disable_count);
  
  void __perf_disable(void)
  {
-       __get_cpu_var(disable_count)++;
+       __get_cpu_var(perf_disable_count)++;
  }
  
  bool __perf_enable(void)
  {
-       return !--__get_cpu_var(disable_count);
+       return !--__get_cpu_var(perf_disable_count);
  }
  
  void perf_disable(void)
@@@ -2176,13 -2176,6 +2176,13 @@@ static int perf_mmap_data_alloc(struct 
        data->nr_pages = nr_pages;
        atomic_set(&data->lock, -1);
  
 +      if (counter->attr.watermark) {
 +              data->watermark = min_t(long, PAGE_SIZE * nr_pages,
 +                                    counter->attr.wakeup_watermark);
 +      }
 +      if (!data->watermark)
 +              data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
 +
        rcu_assign_pointer(counter->data, data);
  
        return 0;
@@@ -2322,8 -2315,7 +2322,8 @@@ static int perf_mmap(struct file *file
        lock_limit >>= PAGE_SHIFT;
        locked = vma->vm_mm->locked_vm + extra;
  
 -      if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
 +      if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
 +              !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }
@@@ -2512,15 -2504,35 +2512,15 @@@ __weak struct perf_callchain_entry *per
  /*
   * Output
   */
 -
 -struct perf_output_handle {
 -      struct perf_counter     *counter;
 -      struct perf_mmap_data   *data;
 -      unsigned long           head;
 -      unsigned long           offset;
 -      int                     nmi;
 -      int                     sample;
 -      int                     locked;
 -      unsigned long           flags;
 -};
 -
 -static bool perf_output_space(struct perf_mmap_data *data,
 -                            unsigned int offset, unsigned int head)
 +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
 +                            unsigned long offset, unsigned long head)
  {
 -      unsigned long tail;
        unsigned long mask;
  
        if (!data->writable)
                return true;
  
        mask = (data->nr_pages << PAGE_SHIFT) - 1;
 -      /*
 -       * Userspace could choose to issue a mb() before updating the tail
 -       * pointer. So that all reads will be completed before the write is
 -       * issued.
 -       */
 -      tail = ACCESS_ONCE(data->user_page->data_tail);
 -      smp_rmb();
  
        offset = (offset - tail) & mask;
        head   = (head   - tail) & mask;
@@@ -2621,8 -2633,8 +2621,8 @@@ out
        local_irq_restore(handle->flags);
  }
  
 -static void perf_output_copy(struct perf_output_handle *handle,
 -                           const void *buf, unsigned int len)
 +void perf_output_copy(struct perf_output_handle *handle,
 +                    const void *buf, unsigned int len)
  {
        unsigned int pages_mask;
        unsigned int offset;
        WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
  }
  
 -#define perf_output_put(handle, x) \
 -      perf_output_copy((handle), &(x), sizeof(x))
 -
 -static int perf_output_begin(struct perf_output_handle *handle,
 -                           struct perf_counter *counter, unsigned int size,
 -                           int nmi, int sample)
 +int perf_output_begin(struct perf_output_handle *handle,
 +                    struct perf_counter *counter, unsigned int size,
 +                    int nmi, int sample)
  {
        struct perf_counter *output_counter;
        struct perf_mmap_data *data;
 -      unsigned int offset, head;
 +      unsigned long tail, offset, head;
        int have_lost;
        struct {
                struct perf_event_header header;
        perf_output_lock(handle);
  
        do {
 +              /*
 +               * Userspace could choose to issue a mb() before updating the
 +               * tail pointer. So that all reads will be completed before the
 +               * write is issued.
 +               */
 +              tail = ACCESS_ONCE(data->user_page->data_tail);
 +              smp_rmb();
                offset = head = atomic_long_read(&data->head);
                head += size;
 -              if (unlikely(!perf_output_space(data, offset, head)))
 +              if (unlikely(!perf_output_space(data, tail, offset, head)))
                        goto fail;
        } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
  
        handle->offset  = offset;
        handle->head    = head;
  
 -      if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
 +      if (head - tail > data->watermark)
                atomic_set(&data->wakeup, 1);
  
        if (have_lost) {
@@@ -2741,7 -2749,7 +2741,7 @@@ out
        return -ENOSPC;
  }
  
 -static void perf_output_end(struct perf_output_handle *handle)
 +void perf_output_end(struct perf_output_handle *handle)
  {
        struct perf_counter *counter = handle->counter;
        struct perf_mmap_data *data = handle->data;
@@@ -2855,176 -2863,156 +2855,176 @@@ static void perf_output_read(struct per
                perf_output_read_one(handle, counter);
  }
  
 -void perf_counter_output(struct perf_counter *counter, int nmi,
 -                              struct perf_sample_data *data)
 +void perf_output_sample(struct perf_output_handle *handle,
 +                      struct perf_event_header *header,
 +                      struct perf_sample_data *data,
 +                      struct perf_counter *counter)
  {
 -      int ret;
 -      u64 sample_type = counter->attr.sample_type;
 -      struct perf_output_handle handle;
 -      struct perf_event_header header;
 -      u64 ip;
 -      struct {
 -              u32 pid, tid;
 -      } tid_entry;
 -      struct perf_callchain_entry *callchain = NULL;
 -      int callchain_size = 0;
 -      u64 time;
 -      struct {
 -              u32 cpu, reserved;
 -      } cpu_entry;
 -
 -      header.type = PERF_EVENT_SAMPLE;
 -      header.size = sizeof(header);
 -
 -      header.misc = 0;
 -      header.misc |= perf_misc_flags(data->regs);
 -
 -      if (sample_type & PERF_SAMPLE_IP) {
 -              ip = perf_instruction_pointer(data->regs);
 -              header.size += sizeof(ip);
 -      }
 +      u64 sample_type = data->type;
  
 -      if (sample_type & PERF_SAMPLE_TID) {
 -              /* namespace issues */
 -              tid_entry.pid = perf_counter_pid(counter, current);
 -              tid_entry.tid = perf_counter_tid(counter, current);
 +      perf_output_put(handle, *header);
  
 -              header.size += sizeof(tid_entry);
 -      }
 +      if (sample_type & PERF_SAMPLE_IP)
 +              perf_output_put(handle, data->ip);
  
 -      if (sample_type & PERF_SAMPLE_TIME) {
 -              /*
 -               * Maybe do better on x86 and provide cpu_clock_nmi()
 -               */
 -              time = sched_clock();
 +      if (sample_type & PERF_SAMPLE_TID)
 +              perf_output_put(handle, data->tid_entry);
  
 -              header.size += sizeof(u64);
 -      }
 +      if (sample_type & PERF_SAMPLE_TIME)
 +              perf_output_put(handle, data->time);
  
        if (sample_type & PERF_SAMPLE_ADDR)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->addr);
  
        if (sample_type & PERF_SAMPLE_ID)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->id);
  
        if (sample_type & PERF_SAMPLE_STREAM_ID)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->stream_id);
  
 -      if (sample_type & PERF_SAMPLE_CPU) {
 -              header.size += sizeof(cpu_entry);
 -
 -              cpu_entry.cpu = raw_smp_processor_id();
 -              cpu_entry.reserved = 0;
 -      }
 +      if (sample_type & PERF_SAMPLE_CPU)
 +              perf_output_put(handle, data->cpu_entry);
  
        if (sample_type & PERF_SAMPLE_PERIOD)
 -              header.size += sizeof(u64);
 +              perf_output_put(handle, data->period);
  
        if (sample_type & PERF_SAMPLE_READ)
 -              header.size += perf_counter_read_size(counter);
 +              perf_output_read(handle, counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 -              callchain = perf_callchain(data->regs);
 +              if (data->callchain) {
 +                      int size = 1;
  
 -              if (callchain) {
 -                      callchain_size = (1 + callchain->nr) * sizeof(u64);
 -                      header.size += callchain_size;
 -              } else
 -                      header.size += sizeof(u64);
 +                      if (data->callchain)
 +                              size += data->callchain->nr;
 +
 +                      size *= sizeof(u64);
 +
 +                      perf_output_copy(handle, data->callchain, size);
 +              } else {
 +                      u64 nr = 0;
 +                      perf_output_put(handle, nr);
 +              }
        }
  
        if (sample_type & PERF_SAMPLE_RAW) {
 -              int size = sizeof(u32);
 +              if (data->raw) {
 +                      perf_output_put(handle, data->raw->size);
 +                      perf_output_copy(handle, data->raw->data,
 +                                       data->raw->size);
 +              } else {
 +                      struct {
 +                              u32     size;
 +                              u32     data;
 +                      } raw = {
 +                              .size = sizeof(u32),
 +                              .data = 0,
 +                      };
 +                      perf_output_put(handle, raw);
 +              }
 +      }
 +}
  
 -              if (data->raw)
 -                      size += data->raw->size;
 -              else
 -                      size += sizeof(u32);
 +void perf_prepare_sample(struct perf_event_header *header,
 +                       struct perf_sample_data *data,
 +                       struct perf_counter *counter,
 +                       struct pt_regs *regs)
 +{
 +      u64 sample_type = counter->attr.sample_type;
  
 -              WARN_ON_ONCE(size & (sizeof(u64)-1));
 -              header.size += size;
 -      }
 +      data->type = sample_type;
  
 -      ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 -      if (ret)
 -              return;
 +      header->type = PERF_EVENT_SAMPLE;
 +      header->size = sizeof(*header);
  
 -      perf_output_put(&handle, header);
 +      header->misc = 0;
 +      header->misc |= perf_misc_flags(regs);
  
 -      if (sample_type & PERF_SAMPLE_IP)
 -              perf_output_put(&handle, ip);
 +      if (sample_type & PERF_SAMPLE_IP) {
 +              data->ip = perf_instruction_pointer(regs);
  
 -      if (sample_type & PERF_SAMPLE_TID)
 -              perf_output_put(&handle, tid_entry);
 +              header->size += sizeof(data->ip);
 +      }
  
 -      if (sample_type & PERF_SAMPLE_TIME)
 -              perf_output_put(&handle, time);
 +      if (sample_type & PERF_SAMPLE_TID) {
 +              /* namespace issues */
 +              data->tid_entry.pid = perf_counter_pid(counter, current);
 +              data->tid_entry.tid = perf_counter_tid(counter, current);
 +
 +              header->size += sizeof(data->tid_entry);
 +      }
 +
 +      if (sample_type & PERF_SAMPLE_TIME) {
 +              data->time = perf_clock();
 +
 +              header->size += sizeof(data->time);
 +      }
  
        if (sample_type & PERF_SAMPLE_ADDR)
 -              perf_output_put(&handle, data->addr);
 +              header->size += sizeof(data->addr);
  
        if (sample_type & PERF_SAMPLE_ID) {
 -              u64 id = primary_counter_id(counter);
 +              data->id = primary_counter_id(counter);
  
 -              perf_output_put(&handle, id);
 +              header->size += sizeof(data->id);
        }
  
 -      if (sample_type & PERF_SAMPLE_STREAM_ID)
 -              perf_output_put(&handle, counter->id);
 +      if (sample_type & PERF_SAMPLE_STREAM_ID) {
 +              data->stream_id = counter->id;
  
 -      if (sample_type & PERF_SAMPLE_CPU)
 -              perf_output_put(&handle, cpu_entry);
 +              header->size += sizeof(data->stream_id);
 +      }
 +
 +      if (sample_type & PERF_SAMPLE_CPU) {
 +              data->cpu_entry.cpu             = raw_smp_processor_id();
 +              data->cpu_entry.reserved        = 0;
 +
 +              header->size += sizeof(data->cpu_entry);
 +      }
  
        if (sample_type & PERF_SAMPLE_PERIOD)
 -              perf_output_put(&handle, data->period);
 +              header->size += sizeof(data->period);
  
        if (sample_type & PERF_SAMPLE_READ)
 -              perf_output_read(&handle, counter);
 +              header->size += perf_counter_read_size(counter);
  
        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 -              if (callchain)
 -                      perf_output_copy(&handle, callchain, callchain_size);
 -              else {
 -                      u64 nr = 0;
 -                      perf_output_put(&handle, nr);
 -              }
 +              int size = 1;
 +
 +              data->callchain = perf_callchain(regs);
 +
 +              if (data->callchain)
 +                      size += data->callchain->nr;
 +
 +              header->size += size * sizeof(u64);
        }
  
        if (sample_type & PERF_SAMPLE_RAW) {
 -              if (data->raw) {
 -                      perf_output_put(&handle, data->raw->size);
 -                      perf_output_copy(&handle, data->raw->data, data->raw->size);
 -              } else {
 -                      struct {
 -                              u32     size;
 -                              u32     data;
 -                      } raw = {
 -                              .size = sizeof(u32),
 -                              .data = 0,
 -                      };
 -                      perf_output_put(&handle, raw);
 -              }
 +              int size = sizeof(u32);
 +
 +              if (data->raw)
 +                      size += data->raw->size;
 +              else
 +                      size += sizeof(u32);
 +
 +              WARN_ON_ONCE(size & (sizeof(u64)-1));
 +              header->size += size;
        }
 +}
 +
 +static void perf_counter_output(struct perf_counter *counter, int nmi,
 +                              struct perf_sample_data *data,
 +                              struct pt_regs *regs)
 +{
 +      struct perf_output_handle handle;
 +      struct perf_event_header header;
 +
 +      perf_prepare_sample(&header, data, counter, regs);
 +
 +      if (perf_output_begin(&handle, counter, header.size, nmi, 1))
 +              return;
 +
 +      perf_output_sample(&handle, &header, data, counter);
  
        perf_output_end(&handle);
  }
@@@ -3485,7 -3473,7 +3485,7 @@@ static void perf_log_throttle(struct pe
                        .misc = 0,
                        .size = sizeof(throttle_event),
                },
 -              .time           = sched_clock(),
 +              .time           = perf_clock(),
                .id             = primary_counter_id(counter),
                .stream_id      = counter->id,
        };
   * Generic counter overflow handling, sampling.
   */
  
 -int perf_counter_overflow(struct perf_counter *counter, int nmi,
 -                        struct perf_sample_data *data)
 +static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
 +                                 int throttle, struct perf_sample_data *data,
 +                                 struct pt_regs *regs)
  {
        int events = atomic_read(&counter->event_limit);
 -      int throttle = counter->pmu->unthrottle != NULL;
        struct hw_perf_counter *hwc = &counter->hw;
        int ret = 0;
  
 +      throttle = (throttle && counter->pmu->unthrottle != NULL);
 +
        if (!throttle) {
                hwc->interrupts++;
        } else {
        }
  
        if (counter->attr.freq) {
 -              u64 now = sched_clock();
 +              u64 now = perf_clock();
                s64 delta = now - hwc->freq_stamp;
  
                hwc->freq_stamp = now;
                        perf_counter_disable(counter);
        }
  
 -      perf_counter_output(counter, nmi, data);
 +      perf_counter_output(counter, nmi, data, regs);
        return ret;
  }
  
 +int perf_counter_overflow(struct perf_counter *counter, int nmi,
 +                        struct perf_sample_data *data,
 +                        struct pt_regs *regs)
 +{
 +      return __perf_counter_overflow(counter, nmi, 1, data, regs);
 +}
 +
  /*
   * Generic software counter infrastructure
   */
@@@ -3609,11 -3588,9 +3609,11 @@@ again
  }
  
  static void perf_swcounter_overflow(struct perf_counter *counter,
 -                                  int nmi, struct perf_sample_data *data)
 +                                  int nmi, struct perf_sample_data *data,
 +                                  struct pt_regs *regs)
  {
        struct hw_perf_counter *hwc = &counter->hw;
 +      int throttle = 0;
        u64 overflow;
  
        data->period = counter->hw.last_period;
                return;
  
        for (; overflow; overflow--) {
 -              if (perf_counter_overflow(counter, nmi, data)) {
 +              if (__perf_counter_overflow(counter, nmi, throttle,
 +                                          data, regs)) {
                        /*
                         * We inhibit the overflow from happening when
                         * hwc->interrupts == MAX_INTERRUPTS.
                         */
                        break;
                }
 +              throttle = 1;
        }
  }
  
@@@ -3643,8 -3618,7 +3643,8 @@@ static void perf_swcounter_unthrottle(s
  }
  
  static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 -                             int nmi, struct perf_sample_data *data)
 +                             int nmi, struct perf_sample_data *data,
 +                             struct pt_regs *regs)
  {
        struct hw_perf_counter *hwc = &counter->hw;
  
        if (!hwc->sample_period)
                return;
  
 -      if (!data->regs)
 +      if (!regs)
                return;
  
        if (!atomic64_add_negative(nr, &hwc->period_left))
 -              perf_swcounter_overflow(counter, nmi, data);
 +              perf_swcounter_overflow(counter, nmi, data, regs);
  }
  
  static int perf_swcounter_is_counting(struct perf_counter *counter)
@@@ -3716,8 -3690,7 +3716,8 @@@ static int perf_swcounter_match(struct 
  static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
                                     enum perf_type_id type,
                                     u32 event, u64 nr, int nmi,
 -                                   struct perf_sample_data *data)
 +                                   struct perf_sample_data *data,
 +                                   struct pt_regs *regs)
  {
        struct perf_counter *counter;
  
  
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
 -              if (perf_swcounter_match(counter, type, event, data->regs))
 -                      perf_swcounter_add(counter, nr, nmi, data);
 +              if (perf_swcounter_match(counter, type, event, regs))
 +                      perf_swcounter_add(counter, nr, nmi, data, regs);
        }
        rcu_read_unlock();
  }
@@@ -3748,8 -3721,7 +3748,8 @@@ static int *perf_swcounter_recursion_co
  
  static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
                                    u64 nr, int nmi,
 -                                  struct perf_sample_data *data)
 +                                  struct perf_sample_data *data,
 +                                  struct pt_regs *regs)
  {
        struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
        int *recursion = perf_swcounter_recursion_context(cpuctx);
        barrier();
  
        perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
 -                               nr, nmi, data);
 +                               nr, nmi, data, regs);
        rcu_read_lock();
        /*
         * doesn't really matter which of the child contexts the
         */
        ctx = rcu_dereference(current->perf_counter_ctxp);
        if (ctx)
 -              perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
 +              perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
        rcu_read_unlock();
  
        barrier();
@@@ -3784,11 -3756,11 +3784,11 @@@ void __perf_swcounter_event(u32 event, 
                            struct pt_regs *regs, u64 addr)
  {
        struct perf_sample_data data = {
 -              .regs = regs,
                .addr = addr,
        };
  
 -      do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
 +      do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
 +                              &data, regs);
  }
  
  static void perf_swcounter_read(struct perf_counter *counter)
@@@ -3825,7 -3797,6 +3825,7 @@@ static enum hrtimer_restart perf_swcoun
  {
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
 +      struct pt_regs *regs;
        struct perf_counter *counter;
        u64 period;
  
        counter->pmu->read(counter);
  
        data.addr = 0;
 -      data.regs = get_irq_regs();
 +      regs = get_irq_regs();
        /*
         * In case we exclude kernel IPs or are somehow not in interrupt
         * context, provide the next best thing, the user IP.
         */
 -      if ((counter->attr.exclude_kernel || !data.regs) &&
 +      if ((counter->attr.exclude_kernel || !regs) &&
                        !counter->attr.exclude_user)
 -              data.regs = task_pt_regs(current);
 +              regs = task_pt_regs(current);
  
 -      if (data.regs) {
 -              if (perf_counter_overflow(counter, 0, &data))
 +      if (regs) {
 +              if (perf_counter_overflow(counter, 0, &data, regs))
                        ret = HRTIMER_NORESTART;
        }
  
@@@ -3979,17 -3950,15 +3979,17 @@@ void perf_tpcounter_event(int event_id
        };
  
        struct perf_sample_data data = {
 -              .regs = get_irq_regs(),
                .addr = addr,
                .raw = &raw,
        };
  
 -      if (!data.regs)
 -              data.regs = task_pt_regs(current);
 +      struct pt_regs *regs = get_irq_regs();
 +
 +      if (!regs)
 +              regs = task_pt_regs(current);
  
 -      do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
 +      do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 +                              &data, regs);
  }
  EXPORT_SYMBOL_GPL(perf_tpcounter_event);
  
@@@ -4246,6 -4215,7 +4246,7 @@@ static int perf_copy_attr(struct perf_c
                        if (val)
                                goto err_size;
                }
+               size = sizeof(*attr);
        }
  
        ret = copy_from_user(attr, uattr, size);
diff --combined kernel/sched_fair.c
index a097e909e80f1d38ec976b857f1b227efa920be7,10d218ab69f2ba4eac39a2d07461c0b2b256de34..990b188803ced00f8b3554bb1fe570b14b716c69
@@@ -513,7 -513,6 +513,7 @@@ static void update_curr(struct cfs_rq *
        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);
  
 +              trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
                cpuacct_charge(curtask, delta_exec);
                account_group_exec_runtime(curtask, delta_exec);
        }
@@@ -712,7 -711,7 +712,7 @@@ place_entity(struct cfs_rq *cfs_rq, str
  
        if (!initial) {
                /* sleeps upto a single latency don't count. */
-               if (sched_feat(NEW_FAIR_SLEEPERS)) {
+               if (sched_feat(FAIR_SLEEPERS)) {
                        unsigned long thresh = sysctl_sched_latency;
  
                        /*
                                         task_of(se)->policy != SCHED_IDLE))
                                thresh = calc_delta_fair(thresh, se);
  
+                       /*
+                        * Halve their sleep time's effect, to allow
+                        * for a gentler effect of sleepers:
+                        */
+                       if (sched_feat(GENTLE_FAIR_SLEEPERS))
+                               thresh >>= 1;
                        vruntime -= thresh;
                }
        }
@@@ -758,10 -764,10 +765,10 @@@ enqueue_entity(struct cfs_rq *cfs_rq, s
  
  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
-       if (cfs_rq->last == se)
+       if (!se || cfs_rq->last == se)
                cfs_rq->last = NULL;
  
-       if (cfs_rq->next == se)
+       if (!se || cfs_rq->next == se)
                cfs_rq->next = NULL;
  }
  
@@@ -1063,83 -1069,6 +1070,6 @@@ static void yield_task_fair(struct rq *
        se->vruntime = rightmost->vruntime + 1;
  }
  
- /*
-  * wake_idle() will wake a task on an idle cpu if task->cpu is
-  * not idle and an idle cpu is available.  The span of cpus to
-  * search starts with cpus closest then further out as needed,
-  * so we always favor a closer, idle cpu.
-  * Domains may include CPUs that are not usable for migration,
-  * hence we need to mask them out (rq->rd->online)
-  *
-  * Returns the CPU we should wake onto.
-  */
- #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
- #define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
- static int wake_idle(int cpu, struct task_struct *p)
- {
-       struct sched_domain *sd;
-       int i;
-       unsigned int chosen_wakeup_cpu;
-       int this_cpu;
-       struct rq *task_rq = task_rq(p);
-       /*
-        * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-        * are idle and this is not a kernel thread and this task's affinity
-        * allows it to be moved to preferred cpu, then just move!
-        */
-       this_cpu = smp_processor_id();
-       chosen_wakeup_cpu =
-               cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-       if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-               idle_cpu(cpu) && idle_cpu(this_cpu) &&
-               p->mm && !(p->flags & PF_KTHREAD) &&
-               cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-               return chosen_wakeup_cpu;
-       /*
-        * If it is idle, then it is the best cpu to run this task.
-        *
-        * This cpu is also the best, if it has more than one task already.
-        * Siblings must be also busy(in most cases) as they didn't already
-        * pickup the extra load from this cpu and hence we need not check
-        * sibling runqueue info. This will avoid the checks and cache miss
-        * penalities associated with that.
-        */
-       if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-               return cpu;
-       for_each_domain(cpu, sd) {
-               if ((sd->flags & SD_WAKE_IDLE)
-                   || ((sd->flags & SD_WAKE_IDLE_FAR)
-                       && !task_hot(p, task_rq->clock, sd))) {
-                       for_each_cpu_and(i, sched_domain_span(sd),
-                                        &p->cpus_allowed) {
-                               if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-                                       if (i != task_cpu(p)) {
-                                               schedstat_inc(p,
-                                                      se.nr_wakeups_idle);
-                                       }
-                                       return i;
-                               }
-                       }
-               } else {
-                       break;
-               }
-       }
-       return cpu;
- }
- #else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
- static inline int wake_idle(int cpu, struct task_struct *p)
- {
-       return cpu;
- }
- #endif
  #ifdef CONFIG_SMP
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@@ -1226,25 -1155,34 +1156,34 @@@ static inline unsigned long effective_l
  
  #endif
  
- static int
- wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-           struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-           int idx, unsigned long load, unsigned long this_load,
-           unsigned int imbalance)
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
  {
-       struct task_struct *curr = this_rq->curr;
-       struct task_group *tg;
-       unsigned long tl = this_load;
+       struct task_struct *curr = current;
+       unsigned long this_load, load;
+       int idx, this_cpu, prev_cpu;
        unsigned long tl_per_task;
+       unsigned int imbalance;
+       struct task_group *tg;
        unsigned long weight;
        int balanced;
  
-       if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-               return 0;
+       idx       = sd->wake_idx;
+       this_cpu  = smp_processor_id();
+       prev_cpu  = task_cpu(p);
+       load      = source_load(prev_cpu, idx);
+       this_load = target_load(this_cpu, idx);
  
-       if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-                       p->se.avg_overlap > sysctl_sched_migration_cost))
-               sync = 0;
+       if (sync) {
+              if (sched_feat(SYNC_LESS) &&
+                  (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+                   p->se.avg_overlap > sysctl_sched_migration_cost))
+                      sync = 0;
+       } else {
+               if (sched_feat(SYNC_MORE) &&
+                   (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+                    p->se.avg_overlap < sysctl_sched_migration_cost))
+                       sync = 1;
+       }
  
        /*
         * If sync wakeup then subtract the (maximum possible)
                tg = task_group(current);
                weight = current->se.load.weight;
  
-               tl += effective_load(tg, this_cpu, -weight, -weight);
+               this_load += effective_load(tg, this_cpu, -weight, -weight);
                load += effective_load(tg, prev_cpu, 0, -weight);
        }
  
        tg = task_group(p);
        weight = p->se.load.weight;
  
+       imbalance = 100 + (sd->imbalance_pct - 100) / 2;
        /*
         * In low-load situations, where prev_cpu is idle and this_cpu is idle
-        * due to the sync cause above having dropped tl to 0, we'll always have
-        * an imbalance, but there's really nothing you can do about that, so
-        * that's good too.
+        * due to the sync cause above having dropped this_load to 0, we'll
+        * always have an imbalance, but there's really nothing you can do
+        * about that, so that's good too.
         *
         * Otherwise check if either cpus are near enough in load to allow this
         * task to be woken on this_cpu.
         */
-       balanced = !tl ||
-               100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+       balanced = !this_load ||
+               100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
                imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
  
        /*
        schedstat_inc(p, se.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
  
-       if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-                       tl_per_task)) {
+       if (balanced ||
+           (this_load <= load &&
+            this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
                /*
                 * This domain has SD_WAKE_AFFINE and
                 * p is cache cold in this domain, and
                 * there is no bad imbalance.
                 */
-               schedstat_inc(this_sd, ttwu_move_affine);
+               schedstat_inc(sd, ttwu_move_affine);
                schedstat_inc(p, se.nr_wakeups_affine);
  
                return 1;
        return 0;
  }
  
- static int select_task_rq_fair(struct task_struct *p, int sync)
+ /*
+  * find_idlest_group finds and returns the least busy CPU group within the
+  * domain.
+  */
+ static struct sched_group *
+ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int load_idx)
  {
-       struct sched_domain *sd, *this_sd = NULL;
-       int prev_cpu, this_cpu, new_cpu;
-       unsigned long load, this_load;
-       struct rq *this_rq;
-       unsigned int imbalance;
-       int idx;
+       struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+       unsigned long min_load = ULONG_MAX, this_load = 0;
+       int imbalance = 100 + (sd->imbalance_pct-100)/2;
  
-       prev_cpu        = task_cpu(p);
-       this_cpu        = smp_processor_id();
-       this_rq         = cpu_rq(this_cpu);
-       new_cpu         = prev_cpu;
+       do {
+               unsigned long load, avg_load;
+               int local_group;
+               int i;
  
-       /*
-        * 'this_sd' is the first domain that both
-        * this_cpu and prev_cpu are present in:
-        */
-       for_each_domain(this_cpu, sd) {
-               if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-                       this_sd = sd;
-                       break;
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
+                       continue;
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
+               /* Tally up the load of all CPUs in the group */
+               avg_load = 0;
+               for_each_cpu(i, sched_group_cpus(group)) {
+                       /* Bias balancing toward cpus of our domain */
+                       if (local_group)
+                               load = source_load(i, load_idx);
+                       else
+                               load = target_load(i, load_idx);
+                       avg_load += load;
+               }
+               /* Adjust by relative CPU power of the group */
+               avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+               if (local_group) {
+                       this_load = avg_load;
+                       this = group;
+               } else if (avg_load < min_load) {
+                       min_load = avg_load;
+                       idlest = group;
+               }
+       } while (group = group->next, group != sd->groups);
+       if (!idlest || 100*this_load < imbalance*min_load)
+               return NULL;
+       return idlest;
+ }
+ /*
+  * find_idlest_cpu - find the idlest cpu among the cpus in group.
+  */
+ static int
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+ {
+       unsigned long load, min_load = ULONG_MAX;
+       int idlest = -1;
+       int i;
+       /* Traverse only the allowed CPUs */
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+               load = weighted_cpuload(i);
+               if (load < min_load || (load == min_load && i == this_cpu)) {
+                       min_load = load;
+                       idlest = i;
                }
        }
  
-       if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-               goto out;
+       return idlest;
+ }
  
-       /*
-        * Check for affine wakeup and passive balancing possibilities.
-        */
-       if (!this_sd)
+ /*
+  * sched_balance_self: balance the current task (running on cpu) in domains
+  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+  * SD_BALANCE_EXEC.
+  *
+  * Balance, ie. select the least loaded group.
+  *
+  * Returns the target CPU number, or the same CPU if no balancing is needed.
+  *
+  * preempt must be disabled.
+  */
+ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+ {
+       struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+       int cpu = smp_processor_id();
+       int prev_cpu = task_cpu(p);
+       int new_cpu = cpu;
+       int want_affine = 0;
+       int want_sd = 1;
+       int sync = wake_flags & WF_SYNC;
+       if (sd_flag & SD_BALANCE_WAKE) {
+               if (sched_feat(AFFINE_WAKEUPS))
+                       want_affine = 1;
+               new_cpu = prev_cpu;
+       }
+       rcu_read_lock();
+       for_each_domain(cpu, tmp) {
+               /*
+                * If power savings logic is enabled for a domain, see if we
+                * are not overloaded, if so, don't balance wider.
+                */
+               if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+                       unsigned long power = 0;
+                       unsigned long nr_running = 0;
+                       unsigned long capacity;
+                       int i;
+                       for_each_cpu(i, sched_domain_span(tmp)) {
+                               power += power_of(i);
+                               nr_running += cpu_rq(i)->cfs.nr_running;
+                       }
+                       capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+                       if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+                               nr_running /= 2;
+                       if (nr_running < capacity)
+                               want_sd = 0;
+               }
+               if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                       affine_sd = tmp;
+                       want_affine = 0;
+               }
+               if (!want_sd && !want_affine)
+                       break;
+               if (!(tmp->flags & sd_flag))
+                       continue;
+               if (want_sd)
+                       sd = tmp;
+       }
+       if (sched_feat(LB_SHARES_UPDATE)) {
+               /*
+                * Pick the largest domain to update shares over
+                */
+               tmp = sd;
+               if (affine_sd && (!tmp ||
+                                 cpumask_weight(sched_domain_span(affine_sd)) >
+                                 cpumask_weight(sched_domain_span(sd))))
+                       tmp = affine_sd;
+               if (tmp)
+                       update_shares(tmp);
+       }
+       if (affine_sd && wake_affine(affine_sd, p, sync)) {
+               new_cpu = cpu;
                goto out;
+       }
  
-       idx = this_sd->wake_idx;
+       while (sd) {
+               int load_idx = sd->forkexec_idx;
+               struct sched_group *group;
+               int weight;
  
-       imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+               if (!(sd->flags & sd_flag)) {
+                       sd = sd->child;
+                       continue;
+               }
  
-       load = source_load(prev_cpu, idx);
-       this_load = target_load(this_cpu, idx);
+               if (sd_flag & SD_BALANCE_WAKE)
+                       load_idx = sd->wake_idx;
  
-       if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-                                    load, this_load, imbalance))
-               return this_cpu;
+               group = find_idlest_group(sd, p, cpu, load_idx);
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
  
-       /*
-        * Start passive balancing when half the imbalance_pct
-        * limit is reached.
-        */
-       if (this_sd->flags & SD_WAKE_BALANCE) {
-               if (imbalance*this_load <= 100*load) {
-                       schedstat_inc(this_sd, ttwu_move_balance);
-                       schedstat_inc(p, se.nr_wakeups_passive);
-                       return this_cpu;
+               new_cpu = find_idlest_cpu(group, p, cpu);
+               if (new_cpu == -1 || new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
                }
+               /* Now try balancing at a lower domain level of new_cpu */
+               cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
+               sd = NULL;
+               for_each_domain(cpu, tmp) {
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
+                               break;
+                       if (tmp->flags & sd_flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
        }
  
  out:
-       return wake_idle(new_cpu, p);
+       rcu_read_unlock();
+       return new_cpu;
  }
  #endif /* CONFIG_SMP */
  
@@@ -1472,11 -1563,12 +1564,12 @@@ static void set_next_buddy(struct sched
  /*
   * Preempt the current task with a newly woken task if needed:
   */
- static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
  {
        struct task_struct *curr = rq->curr;
        struct sched_entity *se = &curr->se, *pse = &p->se;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+       int sync = wake_flags & WF_SYNC;
  
        update_curr(cfs_rq);
  
         */
        if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
                set_last_buddy(se);
-       set_next_buddy(pse);
+       if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+               set_next_buddy(pse);
  
        /*
         * We can come here with TIF_NEED_RESCHED already set from new task
                return;
        }
  
-       if (!sched_feat(WAKEUP_PREEMPT))
-               return;
-       if (sched_feat(WAKEUP_OVERLAP) && (sync ||
-                       (se->avg_overlap < sysctl_sched_migration_cost &&
-                        pse->avg_overlap < sysctl_sched_migration_cost))) {
+       if ((sched_feat(WAKEUP_SYNC) && sync) ||
+           (sched_feat(WAKEUP_OVERLAP) &&
+            (se->avg_overlap < sysctl_sched_migration_cost &&
+             pse->avg_overlap < sysctl_sched_migration_cost))) {
                resched_task(curr);
                return;
        }
  
+       if (sched_feat(WAKEUP_RUNNING)) {
+               if (pse->avg_running < se->avg_running) {
+                       set_next_buddy(pse);
+                       resched_task(curr);
+                       return;
+               }
+       }
+       if (!sched_feat(WAKEUP_PREEMPT))
+               return;
        find_matching_se(&se, &pse);
  
        BUG_ON(!pse);
@@@ -1556,8 -1658,13 +1659,13 @@@ static struct task_struct *pick_next_ta
                /*
                 * If se was a buddy, clear it so that it will have to earn
                 * the favour again.
+                *
+                * If se was not a buddy, clear the buddies because neither
 +                * was eligible to run, let them earn it again.
+                *
+                * IOW. unconditionally clear buddies.
                 */
-               __clear_buddies(cfs_rq, se);
+               __clear_buddies(cfs_rq, NULL);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);