Merge reason: Bring in tracing changes we depend on.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
#define BTS_RECORD_SIZE 24
/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
+#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
+#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
/*
x86_pmu_disable_counter(hwc, idx);
}
- static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
* The hw counter starts counting from this counter offset,
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
local_irq_restore(flags);
}
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
- struct perf_sample_data *data)
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
{
struct debug_store *ds = cpuc->ds;
struct bts_record {
u64 flags;
};
struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
- unsigned long orig_ip = data->regs->ip;
struct bts_record *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct pt_regs regs;
if (!counter)
return;
at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
top = (struct bts_record *)(unsigned long)ds->bts_index;
+ if (top <= at)
+ return;
+
ds->bts_index = ds->bts_buffer_base;
+
+ data.period = counter->hw.last_period;
+ data.addr = 0;
+ regs.ip = 0;
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ perf_prepare_sample(&header, &data, counter, &regs);
+
+ if (perf_output_begin(&handle, counter,
+ header.size * (top - at), 1, 1))
+ return;
+
for (; at < top; at++) {
- data->regs->ip = at->from;
- data->addr = at->to;
+ data.ip = at->from;
+ data.addr = at->to;
- perf_counter_output(counter, 1, data);
+ perf_output_sample(&handle, &header, &data, counter);
}
- data->regs->ip = orig_ip;
- data->addr = 0;
+ perf_output_end(&handle);
/* There's new data available. */
+ counter->hw.interrupts++;
counter->pending_kill = POLL_IN;
}
x86_perf_counter_update(counter, hwc, idx);
/* Drain the remaining BTS records. */
- if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
- struct perf_sample_data data;
- struct pt_regs regs;
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+ intel_pmu_drain_bts_buffer(cpuc);
- data.regs = &regs;
- intel_pmu_drain_bts_buffer(cpuc, &data);
- }
cpuc->counters[idx] = NULL;
clear_bit(idx, cpuc->used_mask);
int idx, handled = 0;
u64 val;
- data.regs = regs;
data.addr = 0;
cpuc = &__get_cpu_var(cpu_hw_counters);
if (!x86_perf_counter_set_period(counter, hwc, idx))
continue;
- if (perf_counter_overflow(counter, 1, &data))
+ if (perf_counter_overflow(counter, 1, &data, regs))
p6_pmu_disable_counter(hwc, idx);
}
int bit, loops;
u64 ack, status;
- data.regs = regs;
data.addr = 0;
cpuc = &__get_cpu_var(cpu_hw_counters);
perf_disable();
- intel_pmu_drain_bts_buffer(cpuc, &data);
+ intel_pmu_drain_bts_buffer(cpuc);
status = intel_pmu_get_status();
if (!status) {
perf_enable();
data.period = counter->hw.last_period;
- if (perf_counter_overflow(counter, 1, &data))
+ if (perf_counter_overflow(counter, 1, &data, regs))
intel_pmu_disable_counter(&counter->hw, bit);
}
int idx, handled = 0;
u64 val;
- data.regs = regs;
data.addr = 0;
cpuc = &__get_cpu_var(cpu_hw_counters);
if (!x86_perf_counter_set_period(counter, hwc, idx))
continue;
- if (perf_counter_overflow(counter, 1, &data))
+ if (perf_counter_overflow(counter, 1, &data, regs))
amd_pmu_disable_counter(hwc, idx);
}
entry->ip[entry->nr++] = ip;
}
- static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
- static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
static DEFINE_PER_CPU(int, in_nmi_frame);
struct perf_callchain_entry *entry;
if (in_nmi())
- entry = &__get_cpu_var(nmi_entry);
+ entry = &__get_cpu_var(pmc_nmi_entry);
else
- entry = &__get_cpu_var(irq_entry);
+ entry = &__get_cpu_var(pmc_irq_entry);
entry->nr = 0;
void __weak perf_counter_print_debug(void) { }
- static DEFINE_PER_CPU(int, disable_count);
+ static DEFINE_PER_CPU(int, perf_disable_count);
void __perf_disable(void)
{
- __get_cpu_var(disable_count)++;
+ __get_cpu_var(perf_disable_count)++;
}
bool __perf_enable(void)
{
- return !--__get_cpu_var(disable_count);
+ return !--__get_cpu_var(perf_disable_count);
}
void perf_disable(void)
data->nr_pages = nr_pages;
atomic_set(&data->lock, -1);
+ if (counter->attr.watermark) {
+ data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+ counter->attr.wakeup_watermark);
+ }
+ if (!data->watermark)
+ data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
rcu_assign_pointer(counter->data, data);
return 0;
lock_limit >>= PAGE_SHIFT;
locked = vma->vm_mm->locked_vm + extra;
- if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+ !capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
}
/*
* Output
*/
-
-struct perf_output_handle {
- struct perf_counter *counter;
- struct perf_mmap_data *data;
- unsigned long head;
- unsigned long offset;
- int nmi;
- int sample;
- int locked;
- unsigned long flags;
-};
-
-static bool perf_output_space(struct perf_mmap_data *data,
- unsigned int offset, unsigned int head)
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+ unsigned long offset, unsigned long head)
{
- unsigned long tail;
unsigned long mask;
if (!data->writable)
return true;
mask = (data->nr_pages << PAGE_SHIFT) - 1;
- /*
- * Userspace could choose to issue a mb() before updating the tail
- * pointer. So that all reads will be completed before the write is
- * issued.
- */
- tail = ACCESS_ONCE(data->user_page->data_tail);
- smp_rmb();
offset = (offset - tail) & mask;
head = (head - tail) & mask;
local_irq_restore(handle->flags);
}
-static void perf_output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
+void perf_output_copy(struct perf_output_handle *handle,
+ const void *buf, unsigned int len)
{
unsigned int pages_mask;
unsigned int offset;
WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
}
-#define perf_output_put(handle, x) \
- perf_output_copy((handle), &(x), sizeof(x))
-
-static int perf_output_begin(struct perf_output_handle *handle,
- struct perf_counter *counter, unsigned int size,
- int nmi, int sample)
+int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_counter *counter, unsigned int size,
+ int nmi, int sample)
{
struct perf_counter *output_counter;
struct perf_mmap_data *data;
- unsigned int offset, head;
+ unsigned long tail, offset, head;
int have_lost;
struct {
struct perf_event_header header;
perf_output_lock(handle);
do {
+ /*
+ * Userspace could choose to issue a mb() before updating the
+ * tail pointer, so that all reads will be completed before the
+ * write is issued.
+ */
+ tail = ACCESS_ONCE(data->user_page->data_tail);
+ smp_rmb();
offset = head = atomic_long_read(&data->head);
head += size;
- if (unlikely(!perf_output_space(data, offset, head)))
+ if (unlikely(!perf_output_space(data, tail, offset, head)))
goto fail;
} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
handle->offset = offset;
handle->head = head;
- if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+ if (head - tail > data->watermark)
atomic_set(&data->wakeup, 1);
if (have_lost) {
return -ENOSPC;
}
-static void perf_output_end(struct perf_output_handle *handle)
+void perf_output_end(struct perf_output_handle *handle)
{
struct perf_counter *counter = handle->counter;
struct perf_mmap_data *data = handle->data;
perf_output_read_one(handle, counter);
}
-void perf_counter_output(struct perf_counter *counter, int nmi,
- struct perf_sample_data *data)
+void perf_output_sample(struct perf_output_handle *handle,
+ struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_counter *counter)
{
- int ret;
- u64 sample_type = counter->attr.sample_type;
- struct perf_output_handle handle;
- struct perf_event_header header;
- u64 ip;
- struct {
- u32 pid, tid;
- } tid_entry;
- struct perf_callchain_entry *callchain = NULL;
- int callchain_size = 0;
- u64 time;
- struct {
- u32 cpu, reserved;
- } cpu_entry;
-
- header.type = PERF_EVENT_SAMPLE;
- header.size = sizeof(header);
-
- header.misc = 0;
- header.misc |= perf_misc_flags(data->regs);
-
- if (sample_type & PERF_SAMPLE_IP) {
- ip = perf_instruction_pointer(data->regs);
- header.size += sizeof(ip);
- }
+ u64 sample_type = data->type;
- if (sample_type & PERF_SAMPLE_TID) {
- /* namespace issues */
- tid_entry.pid = perf_counter_pid(counter, current);
- tid_entry.tid = perf_counter_tid(counter, current);
+ perf_output_put(handle, *header);
- header.size += sizeof(tid_entry);
- }
+ if (sample_type & PERF_SAMPLE_IP)
+ perf_output_put(handle, data->ip);
- if (sample_type & PERF_SAMPLE_TIME) {
- /*
- * Maybe do better on x86 and provide cpu_clock_nmi()
- */
- time = sched_clock();
+ if (sample_type & PERF_SAMPLE_TID)
+ perf_output_put(handle, data->tid_entry);
- header.size += sizeof(u64);
- }
+ if (sample_type & PERF_SAMPLE_TIME)
+ perf_output_put(handle, data->time);
if (sample_type & PERF_SAMPLE_ADDR)
- header.size += sizeof(u64);
+ perf_output_put(handle, data->addr);
if (sample_type & PERF_SAMPLE_ID)
- header.size += sizeof(u64);
+ perf_output_put(handle, data->id);
if (sample_type & PERF_SAMPLE_STREAM_ID)
- header.size += sizeof(u64);
+ perf_output_put(handle, data->stream_id);
- if (sample_type & PERF_SAMPLE_CPU) {
- header.size += sizeof(cpu_entry);
-
- cpu_entry.cpu = raw_smp_processor_id();
- cpu_entry.reserved = 0;
- }
+ if (sample_type & PERF_SAMPLE_CPU)
+ perf_output_put(handle, data->cpu_entry);
if (sample_type & PERF_SAMPLE_PERIOD)
- header.size += sizeof(u64);
+ perf_output_put(handle, data->period);
if (sample_type & PERF_SAMPLE_READ)
- header.size += perf_counter_read_size(counter);
+ perf_output_read(handle, counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- callchain = perf_callchain(data->regs);
+ if (data->callchain) {
+ int size = 1;
- if (callchain) {
- callchain_size = (1 + callchain->nr) * sizeof(u64);
- header.size += callchain_size;
- } else
- header.size += sizeof(u64);
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ size *= sizeof(u64);
+
+ perf_output_copy(handle, data->callchain, size);
+ } else {
+ u64 nr = 0;
+ perf_output_put(handle, nr);
+ }
}
if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
+ if (data->raw) {
+ perf_output_put(handle, data->raw->size);
+ perf_output_copy(handle, data->raw->data,
+ data->raw->size);
+ } else {
+ struct {
+ u32 size;
+ u32 data;
+ } raw = {
+ .size = sizeof(u32),
+ .data = 0,
+ };
+ perf_output_put(handle, raw);
+ }
+ }
+}
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+void perf_prepare_sample(struct perf_event_header *header,
+ struct perf_sample_data *data,
+ struct perf_counter *counter,
+ struct pt_regs *regs)
+{
+ u64 sample_type = counter->attr.sample_type;
- WARN_ON_ONCE(size & (sizeof(u64)-1));
- header.size += size;
- }
+ data->type = sample_type;
- ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
- if (ret)
- return;
+ header->type = PERF_EVENT_SAMPLE;
+ header->size = sizeof(*header);
- perf_output_put(&handle, header);
+ header->misc = 0;
+ header->misc |= perf_misc_flags(regs);
- if (sample_type & PERF_SAMPLE_IP)
- perf_output_put(&handle, ip);
+ if (sample_type & PERF_SAMPLE_IP) {
+ data->ip = perf_instruction_pointer(regs);
- if (sample_type & PERF_SAMPLE_TID)
- perf_output_put(&handle, tid_entry);
+ header->size += sizeof(data->ip);
+ }
- if (sample_type & PERF_SAMPLE_TIME)
- perf_output_put(&handle, time);
+ if (sample_type & PERF_SAMPLE_TID) {
+ /* namespace issues */
+ data->tid_entry.pid = perf_counter_pid(counter, current);
+ data->tid_entry.tid = perf_counter_tid(counter, current);
+
+ header->size += sizeof(data->tid_entry);
+ }
+
+ if (sample_type & PERF_SAMPLE_TIME) {
+ data->time = perf_clock();
+
+ header->size += sizeof(data->time);
+ }
if (sample_type & PERF_SAMPLE_ADDR)
- perf_output_put(&handle, data->addr);
+ header->size += sizeof(data->addr);
if (sample_type & PERF_SAMPLE_ID) {
- u64 id = primary_counter_id(counter);
+ data->id = primary_counter_id(counter);
- perf_output_put(&handle, id);
+ header->size += sizeof(data->id);
}
- if (sample_type & PERF_SAMPLE_STREAM_ID)
- perf_output_put(&handle, counter->id);
+ if (sample_type & PERF_SAMPLE_STREAM_ID) {
+ data->stream_id = counter->id;
- if (sample_type & PERF_SAMPLE_CPU)
- perf_output_put(&handle, cpu_entry);
+ header->size += sizeof(data->stream_id);
+ }
+
+ if (sample_type & PERF_SAMPLE_CPU) {
+ data->cpu_entry.cpu = raw_smp_processor_id();
+ data->cpu_entry.reserved = 0;
+
+ header->size += sizeof(data->cpu_entry);
+ }
if (sample_type & PERF_SAMPLE_PERIOD)
- perf_output_put(&handle, data->period);
+ header->size += sizeof(data->period);
if (sample_type & PERF_SAMPLE_READ)
- perf_output_read(&handle, counter);
+ header->size += perf_counter_read_size(counter);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
- if (callchain)
- perf_output_copy(&handle, callchain, callchain_size);
- else {
- u64 nr = 0;
- perf_output_put(&handle, nr);
- }
+ int size = 1;
+
+ data->callchain = perf_callchain(regs);
+
+ if (data->callchain)
+ size += data->callchain->nr;
+
+ header->size += size * sizeof(u64);
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- perf_output_put(&handle, data->raw->size);
- perf_output_copy(&handle, data->raw->data, data->raw->size);
- } else {
- struct {
- u32 size;
- u32 data;
- } raw = {
- .size = sizeof(u32),
- .data = 0,
- };
- perf_output_put(&handle, raw);
- }
+ int size = sizeof(u32);
+
+ if (data->raw)
+ size += data->raw->size;
+ else
+ size += sizeof(u32);
+
+ WARN_ON_ONCE(size & (sizeof(u64)-1));
+ header->size += size;
}
+}
+
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+
+ perf_prepare_sample(&header, data, counter, regs);
+
+ if (perf_output_begin(&handle, counter, header.size, nmi, 1))
+ return;
+
+ perf_output_sample(&handle, &header, data, counter);
perf_output_end(&handle);
}
.misc = 0,
.size = sizeof(throttle_event),
},
- .time = sched_clock(),
+ .time = perf_clock(),
.id = primary_counter_id(counter),
.stream_id = counter->id,
};
* Generic counter overflow handling, sampling.
*/
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
- struct perf_sample_data *data)
+static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
int events = atomic_read(&counter->event_limit);
- int throttle = counter->pmu->unthrottle != NULL;
struct hw_perf_counter *hwc = &counter->hw;
int ret = 0;
+ throttle = (throttle && counter->pmu->unthrottle != NULL);
+
if (!throttle) {
hwc->interrupts++;
} else {
}
if (counter->attr.freq) {
- u64 now = sched_clock();
+ u64 now = perf_clock();
s64 delta = now - hwc->freq_stamp;
hwc->freq_stamp = now;
perf_counter_disable(counter);
}
- perf_counter_output(counter, nmi, data);
+ perf_counter_output(counter, nmi, data, regs);
return ret;
}
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ return __perf_counter_overflow(counter, nmi, 1, data, regs);
+}
+
/*
* Generic software counter infrastructure
*/
}
static void perf_swcounter_overflow(struct perf_counter *counter,
- int nmi, struct perf_sample_data *data)
+ int nmi, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct hw_perf_counter *hwc = &counter->hw;
+ int throttle = 0;
u64 overflow;
data->period = counter->hw.last_period;
return;
for (; overflow; overflow--) {
- if (perf_counter_overflow(counter, nmi, data)) {
+ if (__perf_counter_overflow(counter, nmi, throttle,
+ data, regs)) {
/*
* We inhibit the overflow from happening when
* hwc->interrupts == MAX_INTERRUPTS.
*/
break;
}
+ throttle = 1;
}
}
}
static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
- int nmi, struct perf_sample_data *data)
+ int nmi, struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct hw_perf_counter *hwc = &counter->hw;
if (!hwc->sample_period)
return;
- if (!data->regs)
+ if (!regs)
return;
if (!atomic64_add_negative(nr, &hwc->period_left))
- perf_swcounter_overflow(counter, nmi, data);
+ perf_swcounter_overflow(counter, nmi, data, regs);
}
static int perf_swcounter_is_counting(struct perf_counter *counter)
static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
enum perf_type_id type,
u32 event, u64 nr, int nmi,
- struct perf_sample_data *data)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_counter *counter;
rcu_read_lock();
list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
- if (perf_swcounter_match(counter, type, event, data->regs))
- perf_swcounter_add(counter, nr, nmi, data);
+ if (perf_swcounter_match(counter, type, event, regs))
+ perf_swcounter_add(counter, nr, nmi, data, regs);
}
rcu_read_unlock();
}
static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
u64 nr, int nmi,
- struct perf_sample_data *data)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
int *recursion = perf_swcounter_recursion_context(cpuctx);
barrier();
perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
- nr, nmi, data);
+ nr, nmi, data, regs);
rcu_read_lock();
/*
* doesn't really matter which of the child contexts the
*/
ctx = rcu_dereference(current->perf_counter_ctxp);
if (ctx)
- perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+ perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
rcu_read_unlock();
barrier();
struct pt_regs *regs, u64 addr)
{
struct perf_sample_data data = {
- .regs = regs,
.addr = addr,
};
- do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+ do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
+ &data, regs);
}
static void perf_swcounter_read(struct perf_counter *counter)
{
enum hrtimer_restart ret = HRTIMER_RESTART;
struct perf_sample_data data;
+ struct pt_regs *regs;
struct perf_counter *counter;
u64 period;
counter->pmu->read(counter);
data.addr = 0;
- data.regs = get_irq_regs();
+ regs = get_irq_regs();
/*
* In case we exclude kernel IPs or are somehow not in interrupt
* context, provide the next best thing, the user IP.
*/
- if ((counter->attr.exclude_kernel || !data.regs) &&
+ if ((counter->attr.exclude_kernel || !regs) &&
!counter->attr.exclude_user)
- data.regs = task_pt_regs(current);
+ regs = task_pt_regs(current);
- if (data.regs) {
- if (perf_counter_overflow(counter, 0, &data))
+ if (regs) {
+ if (perf_counter_overflow(counter, 0, &data, regs))
ret = HRTIMER_NORESTART;
}
};
struct perf_sample_data data = {
- .regs = get_irq_regs(),
.addr = addr,
.raw = &raw,
};
- if (!data.regs)
- data.regs = task_pt_regs(current);
+ struct pt_regs *regs = get_irq_regs();
+
+ if (!regs)
+ regs = task_pt_regs(current);
- do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+ do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+ &data, regs);
}
EXPORT_SYMBOL_GPL(perf_tpcounter_event);
if (val)
goto err_size;
}
+ size = sizeof(*attr);
}
ret = copy_from_user(attr, uattr, size);
if (entity_is_task(curr)) {
struct task_struct *curtask = task_of(curr);
+ trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
if (!initial) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS)) {
+ if (sched_feat(FAIR_SLEEPERS)) {
unsigned long thresh = sysctl_sched_latency;
/*
task_of(se)->policy != SCHED_IDLE))
thresh = calc_delta_fair(thresh, se);
+ /*
+ * Halve their sleep time's effect, to allow
+ * for a gentler effect of sleepers:
+ */
+ if (sched_feat(GENTLE_FAIR_SLEEPERS))
+ thresh >>= 1;
+
vruntime -= thresh;
}
}
static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->last == se)
+ if (!se || cfs_rq->last == se)
cfs_rq->last = NULL;
- if (cfs_rq->next == se)
+ if (!se || cfs_rq->next == se)
cfs_rq->next = NULL;
}
se->vruntime = rightmost->vruntime + 1;
}
- /*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available. The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
- #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-
- #define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-
- static int wake_idle(int cpu, struct task_struct *p)
- {
- struct sched_domain *sd;
- int i;
- unsigned int chosen_wakeup_cpu;
- int this_cpu;
- struct rq *task_rq = task_rq(p);
-
- /*
- * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
- * are idle and this is not a kernel thread and this task's affinity
- * allows it to be moved to preferred cpu, then just move!
- */
-
- this_cpu = smp_processor_id();
- chosen_wakeup_cpu =
- cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
- if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
- idle_cpu(cpu) && idle_cpu(this_cpu) &&
- p->mm && !(p->flags & PF_KTHREAD) &&
- cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
- return chosen_wakeup_cpu;
-
- /*
- * If it is idle, then it is the best cpu to run this task.
- *
- * This cpu is also the best, if it has more than one task already.
- * Siblings must be also busy(in most cases) as they didn't already
- * pickup the extra load from this cpu and hence we need not check
- * sibling runqueue info. This will avoid the checks and cache miss
- * penalities associated with that.
- */
- if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
- return cpu;
-
- for_each_domain(cpu, sd) {
- if ((sd->flags & SD_WAKE_IDLE)
- || ((sd->flags & SD_WAKE_IDLE_FAR)
- && !task_hot(p, task_rq->clock, sd))) {
- for_each_cpu_and(i, sched_domain_span(sd),
- &p->cpus_allowed) {
- if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
- if (i != task_cpu(p)) {
- schedstat_inc(p,
- se.nr_wakeups_idle);
- }
- return i;
- }
- }
- } else {
- break;
- }
- }
- return cpu;
- }
- #else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
- static inline int wake_idle(int cpu, struct task_struct *p)
- {
- return cpu;
- }
- #endif
-
#ifdef CONFIG_SMP
#ifdef CONFIG_FAIR_GROUP_SCHED
#endif
- static int
- wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
- struct task_struct *p, int prev_cpu, int this_cpu, int sync,
- int idx, unsigned long load, unsigned long this_load,
- unsigned int imbalance)
+ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
- struct task_struct *curr = this_rq->curr;
- struct task_group *tg;
- unsigned long tl = this_load;
+ struct task_struct *curr = current;
+ unsigned long this_load, load;
+ int idx, this_cpu, prev_cpu;
unsigned long tl_per_task;
+ unsigned int imbalance;
+ struct task_group *tg;
unsigned long weight;
int balanced;
- if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
- return 0;
+ idx = sd->wake_idx;
+ this_cpu = smp_processor_id();
+ prev_cpu = task_cpu(p);
+ load = source_load(prev_cpu, idx);
+ this_load = target_load(this_cpu, idx);
- if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
+ if (sync) {
+ if (sched_feat(SYNC_LESS) &&
+ (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+ p->se.avg_overlap > sysctl_sched_migration_cost))
+ sync = 0;
+ } else {
+ if (sched_feat(SYNC_MORE) &&
+ (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost))
+ sync = 1;
+ }
/*
* If sync wakeup then subtract the (maximum possible)
tg = task_group(current);
weight = current->se.load.weight;
- tl += effective_load(tg, this_cpu, -weight, -weight);
+ this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
weight = p->se.load.weight;
+ imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
- * due to the sync cause above having dropped tl to 0, we'll always have
- * an imbalance, but there's really nothing you can do about that, so
- * that's good too.
+ * due to the sync cause above having dropped this_load to 0, we'll
+ * always have an imbalance, but there's really nothing you can do
+ * about that, so that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
- balanced = !tl ||
- 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+ balanced = !this_load ||
+ 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
/*
schedstat_inc(p, se.nr_wakeups_affine_attempts);
tl_per_task = cpu_avg_load_per_task(this_cpu);
- if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
- tl_per_task)) {
+ if (balanced ||
+ (this_load <= load &&
+ this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
/*
* This domain has SD_WAKE_AFFINE and
* p is cache cold in this domain, and
* there is no bad imbalance.
*/
- schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(sd, ttwu_move_affine);
schedstat_inc(p, se.nr_wakeups_affine);
return 1;
return 0;
}
- static int select_task_rq_fair(struct task_struct *p, int sync)
+ /*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+ static struct sched_group *
+ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int load_idx)
{
- struct sched_domain *sd, *this_sd = NULL;
- int prev_cpu, this_cpu, new_cpu;
- unsigned long load, this_load;
- struct rq *this_rq;
- unsigned int imbalance;
- int idx;
+ struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long min_load = ULONG_MAX, this_load = 0;
+ int imbalance = 100 + (sd->imbalance_pct-100)/2;
- prev_cpu = task_cpu(p);
- this_cpu = smp_processor_id();
- this_rq = cpu_rq(this_cpu);
- new_cpu = prev_cpu;
+ do {
+ unsigned long load, avg_load;
+ int local_group;
+ int i;
- /*
- * 'this_sd' is the first domain that both
- * this_cpu and prev_cpu are present in:
- */
- for_each_domain(this_cpu, sd) {
- if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
- this_sd = sd;
- break;
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpumask_intersects(sched_group_cpus(group),
+ &p->cpus_allowed))
+ continue;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
+
+ for_each_cpu(i, sched_group_cpus(group)) {
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = source_load(i, load_idx);
+ else
+ load = target_load(i, load_idx);
+
+ avg_load += load;
+ }
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+ if (local_group) {
+ this_load = avg_load;
+ this = group;
+ } else if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+ } while (group = group->next, group != sd->groups);
+
+ if (!idlest || 100*this_load < imbalance*min_load)
+ return NULL;
+ return idlest;
+ }
+
+ /*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+ static int
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+ {
+ unsigned long load, min_load = ULONG_MAX;
+ int idlest = -1;
+ int i;
+
+ /* Traverse only the allowed CPUs */
+ for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+ load = weighted_cpuload(i);
+
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ idlest = i;
}
}
- if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
- goto out;
+ return idlest;
+ }
- /*
- * Check for affine wakeup and passive balancing possibilities.
- */
- if (!this_sd)
+ /*
+ * select_task_rq_fair: balance task p, starting from the domains of the
+ * current cpu, in domains that have the 'sd_flag' flag set. In practice,
+ * this is SD_BALANCE_FORK, SD_BALANCE_EXEC or SD_BALANCE_WAKE.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+ {
+ struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ int cpu = smp_processor_id();
+ int prev_cpu = task_cpu(p);
+ int new_cpu = cpu;
+ int want_affine = 0;
+ int want_sd = 1;
+ int sync = wake_flags & WF_SYNC;
+
+ if (sd_flag & SD_BALANCE_WAKE) {
+ if (sched_feat(AFFINE_WAKEUPS))
+ want_affine = 1;
+ new_cpu = prev_cpu;
+ }
+
+ rcu_read_lock();
+ for_each_domain(cpu, tmp) {
+ /*
+ * If power savings logic is enabled for a domain, see if we
+ * are not overloaded, if so, don't balance wider.
+ */
+ if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+ unsigned long power = 0;
+ unsigned long nr_running = 0;
+ unsigned long capacity;
+ int i;
+
+ for_each_cpu(i, sched_domain_span(tmp)) {
+ power += power_of(i);
+ nr_running += cpu_rq(i)->cfs.nr_running;
+ }
+
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+
+ if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+ nr_running /= 2;
+
+ if (nr_running < capacity)
+ want_sd = 0;
+ }
+
+ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+ cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+
+ affine_sd = tmp;
+ want_affine = 0;
+ }
+
+ if (!want_sd && !want_affine)
+ break;
+
+ if (!(tmp->flags & sd_flag))
+ continue;
+
+ if (want_sd)
+ sd = tmp;
+ }
+
+ if (sched_feat(LB_SHARES_UPDATE)) {
+ /*
+ * Pick the largest domain to update shares over
+ */
+ tmp = sd;
+ if (affine_sd && (!tmp ||
+ cpumask_weight(sched_domain_span(affine_sd)) >
+ cpumask_weight(sched_domain_span(sd))))
+ tmp = affine_sd;
+
+ if (tmp)
+ update_shares(tmp);
+ }
+
+ if (affine_sd && wake_affine(affine_sd, p, sync)) {
+ new_cpu = cpu;
goto out;
+ }
- idx = this_sd->wake_idx;
+ while (sd) {
+ int load_idx = sd->forkexec_idx;
+ struct sched_group *group;
+ int weight;
- imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
- load = source_load(prev_cpu, idx);
- this_load = target_load(this_cpu, idx);
+ if (sd_flag & SD_BALANCE_WAKE)
+ load_idx = sd->wake_idx;
- if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
- load, this_load, imbalance))
- return this_cpu;
+ group = find_idlest_group(sd, p, cpu, load_idx);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- if (this_sd->flags & SD_WAKE_BALANCE) {
- if (imbalance*this_load <= 100*load) {
- schedstat_inc(this_sd, ttwu_move_balance);
- schedstat_inc(p, se.nr_wakeups_passive);
- return this_cpu;
+ new_cpu = find_idlest_cpu(group, p, cpu);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
}
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ weight = cpumask_weight(sched_domain_span(sd));
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= cpumask_weight(sched_domain_span(tmp)))
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
}
out:
- return wake_idle(new_cpu, p);
+ rcu_read_unlock();
+ return new_cpu;
}
#endif /* CONFIG_SMP */
/*
* Preempt the current task with a newly woken task if needed:
*/
- static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ int sync = wake_flags & WF_SYNC;
update_curr(cfs_rq);
*/
if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
set_last_buddy(se);
- set_next_buddy(pse);
+ if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+ set_next_buddy(pse);
/*
* We can come here with TIF_NEED_RESCHED already set from new task
return;
}
- if (!sched_feat(WAKEUP_PREEMPT))
- return;
-
- if (sched_feat(WAKEUP_OVERLAP) && (sync ||
- (se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost))) {
+ if ((sched_feat(WAKEUP_SYNC) && sync) ||
+ (sched_feat(WAKEUP_OVERLAP) &&
+ (se->avg_overlap < sysctl_sched_migration_cost &&
+ pse->avg_overlap < sysctl_sched_migration_cost))) {
resched_task(curr);
return;
}
+ if (sched_feat(WAKEUP_RUNNING)) {
+ if (pse->avg_running < se->avg_running) {
+ set_next_buddy(pse);
+ resched_task(curr);
+ return;
+ }
+ }
+
+ if (!sched_feat(WAKEUP_PREEMPT))
+ return;
+
find_matching_se(&se, &pse);
BUG_ON(!pse);
/*
* If se was a buddy, clear it so that it will have to earn
* the favour again.
+ *
+ * If se was not a buddy, clear the buddies because neither
+ * was eligible to run; let them earn it again.
+ *
+ * IOW, unconditionally clear buddies.
*/
- __clear_buddies(cfs_rq, se);
+ __clear_buddies(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);