5 #include "util/evlist.h"
6 #include "util/cache.h"
7 #include "util/evsel.h"
8 #include "util/symbol.h"
9 #include "util/thread.h"
10 #include "util/header.h"
11 #include "util/session.h"
12 #include "util/tool.h"
14 #include "util/parse-options.h"
15 #include "util/trace-event.h"
17 #include "util/debug.h"
19 #include <sys/prctl.h>
20 #include <sys/resource.h>
22 #include <semaphore.h>
26 #define PR_SET_NAME 15 /* Set process name */
39 unsigned long nr_events;
40 unsigned long curr_event;
41 struct sched_atom **atoms;
52 enum sched_event_type {
56 SCHED_EVENT_MIGRATION,
60 enum sched_event_type type;
66 struct task_desc *wakee;
69 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
79 struct list_head list;
80 enum thread_state state;
88 struct list_head work_list;
89 struct thread *thread;
98 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
102 struct trace_sched_handler {
103 int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
104 struct perf_sample *sample, struct machine *machine);
106 int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
107 struct perf_sample *sample, struct machine *machine);
109 int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
110 struct perf_sample *sample, struct machine *machine);
112 /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
113 int (*fork_event)(struct perf_sched *sched, union perf_event *event,
114 struct machine *machine);
116 int (*migrate_task_event)(struct perf_sched *sched,
117 struct perf_evsel *evsel,
118 struct perf_sample *sample,
119 struct machine *machine);
123 struct perf_tool tool;
124 const char *sort_order;
125 unsigned long nr_tasks;
126 struct task_desc *pid_to_task[MAX_PID];
127 struct task_desc **tasks;
128 const struct trace_sched_handler *tp_handler;
129 pthread_mutex_t start_work_mutex;
130 pthread_mutex_t work_done_wait_mutex;
133 * Track the current task - that way we can know whether there's any
134 * weird events, such as a task being switched away that is not current.
137 u32 curr_pid[MAX_CPUS];
138 struct thread *curr_thread[MAX_CPUS];
139 char next_shortname1;
140 char next_shortname2;
141 unsigned int replay_repeat;
142 unsigned long nr_run_events;
143 unsigned long nr_sleep_events;
144 unsigned long nr_wakeup_events;
145 unsigned long nr_sleep_corrections;
146 unsigned long nr_run_events_optimized;
147 unsigned long targetless_wakeups;
148 unsigned long multitarget_wakeups;
149 unsigned long nr_runs;
150 unsigned long nr_timestamps;
151 unsigned long nr_unordered_timestamps;
152 unsigned long nr_state_machine_bugs;
153 unsigned long nr_context_switch_bugs;
154 unsigned long nr_events;
155 unsigned long nr_lost_chunks;
156 unsigned long nr_lost_events;
157 u64 run_measurement_overhead;
158 u64 sleep_measurement_overhead;
161 u64 runavg_cpu_usage;
162 u64 parent_cpu_usage;
163 u64 runavg_parent_cpu_usage;
169 u64 cpu_last_switched[MAX_CPUS];
170 struct rb_root atom_root, sorted_atom_root;
171 struct list_head sort_list, cmp_pid;
174 static u64 get_nsecs(void)
178 clock_gettime(CLOCK_MONOTONIC, &ts);
180 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
183 static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
185 u64 T0 = get_nsecs(), T1;
189 } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
192 static void sleep_nsecs(u64 nsecs)
196 ts.tv_nsec = nsecs % 999999999;
197 ts.tv_sec = nsecs / 999999999;
199 nanosleep(&ts, NULL);
202 static void calibrate_run_measurement_overhead(struct perf_sched *sched)
204 u64 T0, T1, delta, min_delta = 1000000000ULL;
207 for (i = 0; i < 10; i++) {
209 burn_nsecs(sched, 0);
212 min_delta = min(min_delta, delta);
214 sched->run_measurement_overhead = min_delta;
216 printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
219 static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
221 u64 T0, T1, delta, min_delta = 1000000000ULL;
224 for (i = 0; i < 10; i++) {
229 min_delta = min(min_delta, delta);
232 sched->sleep_measurement_overhead = min_delta;
234 printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
237 static struct sched_atom *
238 get_new_event(struct task_desc *task, u64 timestamp)
240 struct sched_atom *event = zalloc(sizeof(*event));
241 unsigned long idx = task->nr_events;
244 event->timestamp = timestamp;
248 size = sizeof(struct sched_atom *) * task->nr_events;
249 task->atoms = realloc(task->atoms, size);
250 BUG_ON(!task->atoms);
252 task->atoms[idx] = event;
257 static struct sched_atom *last_event(struct task_desc *task)
259 if (!task->nr_events)
262 return task->atoms[task->nr_events - 1];
265 static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
266 u64 timestamp, u64 duration)
268 struct sched_atom *event, *curr_event = last_event(task);
271 * optimize an existing RUN event by merging this one
274 if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
275 sched->nr_run_events_optimized++;
276 curr_event->duration += duration;
280 event = get_new_event(task, timestamp);
282 event->type = SCHED_EVENT_RUN;
283 event->duration = duration;
285 sched->nr_run_events++;
288 static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
289 u64 timestamp, struct task_desc *wakee)
291 struct sched_atom *event, *wakee_event;
293 event = get_new_event(task, timestamp);
294 event->type = SCHED_EVENT_WAKEUP;
295 event->wakee = wakee;
297 wakee_event = last_event(wakee);
298 if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
299 sched->targetless_wakeups++;
302 if (wakee_event->wait_sem) {
303 sched->multitarget_wakeups++;
307 wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
308 sem_init(wakee_event->wait_sem, 0, 0);
309 wakee_event->specific_wait = 1;
310 event->wait_sem = wakee_event->wait_sem;
312 sched->nr_wakeup_events++;
315 static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
316 u64 timestamp, u64 task_state __maybe_unused)
318 struct sched_atom *event = get_new_event(task, timestamp);
320 event->type = SCHED_EVENT_SLEEP;
322 sched->nr_sleep_events++;
325 static struct task_desc *register_pid(struct perf_sched *sched,
326 unsigned long pid, const char *comm)
328 struct task_desc *task;
330 BUG_ON(pid >= MAX_PID);
332 task = sched->pid_to_task[pid];
337 task = zalloc(sizeof(*task));
339 task->nr = sched->nr_tasks;
340 strcpy(task->comm, comm);
342 * every task starts in sleeping state - this gets ignored
343 * if there's no wakeup pointing to this sleep state:
345 add_sched_event_sleep(sched, task, 0, 0);
347 sched->pid_to_task[pid] = task;
349 sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_task *));
350 BUG_ON(!sched->tasks);
351 sched->tasks[task->nr] = task;
354 printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
360 static void print_task_traces(struct perf_sched *sched)
362 struct task_desc *task;
365 for (i = 0; i < sched->nr_tasks; i++) {
366 task = sched->tasks[i];
367 printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
368 task->nr, task->comm, task->pid, task->nr_events);
372 static void add_cross_task_wakeups(struct perf_sched *sched)
374 struct task_desc *task1, *task2;
377 for (i = 0; i < sched->nr_tasks; i++) {
378 task1 = sched->tasks[i];
380 if (j == sched->nr_tasks)
382 task2 = sched->tasks[j];
383 add_sched_event_wakeup(sched, task1, 0, task2);
387 static void perf_sched__process_event(struct perf_sched *sched,
388 struct sched_atom *atom)
392 switch (atom->type) {
393 case SCHED_EVENT_RUN:
394 burn_nsecs(sched, atom->duration);
396 case SCHED_EVENT_SLEEP:
398 ret = sem_wait(atom->wait_sem);
401 case SCHED_EVENT_WAKEUP:
403 ret = sem_post(atom->wait_sem);
406 case SCHED_EVENT_MIGRATION:
413 static u64 get_cpu_usage_nsec_parent(void)
419 err = getrusage(RUSAGE_SELF, &ru);
422 sum = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
423 sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
428 static int self_open_counters(void)
430 struct perf_event_attr attr;
433 memset(&attr, 0, sizeof(attr));
435 attr.type = PERF_TYPE_SOFTWARE;
436 attr.config = PERF_COUNT_SW_TASK_CLOCK;
438 fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
441 pr_err("Error: sys_perf_event_open() syscall returned "
442 "with %d (%s)\n", fd, strerror(errno));
446 static u64 get_cpu_usage_nsec_self(int fd)
451 ret = read(fd, &runtime, sizeof(runtime));
452 BUG_ON(ret != sizeof(runtime));
457 struct sched_thread_parms {
458 struct task_desc *task;
459 struct perf_sched *sched;
462 static void *thread_func(void *ctx)
464 struct sched_thread_parms *parms = ctx;
465 struct task_desc *this_task = parms->task;
466 struct perf_sched *sched = parms->sched;
467 u64 cpu_usage_0, cpu_usage_1;
468 unsigned long i, ret;
474 sprintf(comm2, ":%s", this_task->comm);
475 prctl(PR_SET_NAME, comm2);
476 fd = self_open_counters();
480 ret = sem_post(&this_task->ready_for_work);
482 ret = pthread_mutex_lock(&sched->start_work_mutex);
484 ret = pthread_mutex_unlock(&sched->start_work_mutex);
487 cpu_usage_0 = get_cpu_usage_nsec_self(fd);
489 for (i = 0; i < this_task->nr_events; i++) {
490 this_task->curr_event = i;
491 perf_sched__process_event(sched, this_task->atoms[i]);
494 cpu_usage_1 = get_cpu_usage_nsec_self(fd);
495 this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
496 ret = sem_post(&this_task->work_done_sem);
499 ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
501 ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
507 static void create_tasks(struct perf_sched *sched)
509 struct task_desc *task;
514 err = pthread_attr_init(&attr);
516 err = pthread_attr_setstacksize(&attr,
517 (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
519 err = pthread_mutex_lock(&sched->start_work_mutex);
521 err = pthread_mutex_lock(&sched->work_done_wait_mutex);
523 for (i = 0; i < sched->nr_tasks; i++) {
524 struct sched_thread_parms *parms = malloc(sizeof(*parms));
525 BUG_ON(parms == NULL);
526 parms->task = task = sched->tasks[i];
527 parms->sched = sched;
528 sem_init(&task->sleep_sem, 0, 0);
529 sem_init(&task->ready_for_work, 0, 0);
530 sem_init(&task->work_done_sem, 0, 0);
531 task->curr_event = 0;
532 err = pthread_create(&task->thread, &attr, thread_func, parms);
537 static void wait_for_tasks(struct perf_sched *sched)
539 u64 cpu_usage_0, cpu_usage_1;
540 struct task_desc *task;
541 unsigned long i, ret;
543 sched->start_time = get_nsecs();
544 sched->cpu_usage = 0;
545 pthread_mutex_unlock(&sched->work_done_wait_mutex);
547 for (i = 0; i < sched->nr_tasks; i++) {
548 task = sched->tasks[i];
549 ret = sem_wait(&task->ready_for_work);
551 sem_init(&task->ready_for_work, 0, 0);
553 ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
556 cpu_usage_0 = get_cpu_usage_nsec_parent();
558 pthread_mutex_unlock(&sched->start_work_mutex);
560 for (i = 0; i < sched->nr_tasks; i++) {
561 task = sched->tasks[i];
562 ret = sem_wait(&task->work_done_sem);
564 sem_init(&task->work_done_sem, 0, 0);
565 sched->cpu_usage += task->cpu_usage;
569 cpu_usage_1 = get_cpu_usage_nsec_parent();
570 if (!sched->runavg_cpu_usage)
571 sched->runavg_cpu_usage = sched->cpu_usage;
572 sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
574 sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
575 if (!sched->runavg_parent_cpu_usage)
576 sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
577 sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
578 sched->parent_cpu_usage)/10;
580 ret = pthread_mutex_lock(&sched->start_work_mutex);
583 for (i = 0; i < sched->nr_tasks; i++) {
584 task = sched->tasks[i];
585 sem_init(&task->sleep_sem, 0, 0);
586 task->curr_event = 0;
590 static void run_one_test(struct perf_sched *sched)
592 u64 T0, T1, delta, avg_delta, fluct;
595 wait_for_tasks(sched);
599 sched->sum_runtime += delta;
602 avg_delta = sched->sum_runtime / sched->nr_runs;
603 if (delta < avg_delta)
604 fluct = avg_delta - delta;
606 fluct = delta - avg_delta;
607 sched->sum_fluct += fluct;
609 sched->run_avg = delta;
610 sched->run_avg = (sched->run_avg * 9 + delta) / 10;
612 printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
614 printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
616 printf("cpu: %0.2f / %0.2f",
617 (double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
621 * rusage statistics done by the parent, these are less
622 * accurate than the sched->sum_exec_runtime based statistics:
624 printf(" [%0.2f / %0.2f]",
625 (double)sched->parent_cpu_usage/1e6,
626 (double)sched->runavg_parent_cpu_usage/1e6);
631 if (sched->nr_sleep_corrections)
632 printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
633 sched->nr_sleep_corrections = 0;
636 static void test_calibrations(struct perf_sched *sched)
641 burn_nsecs(sched, 1e6);
644 printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
650 printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
654 replay_wakeup_event(struct perf_sched *sched,
655 struct perf_evsel *evsel, struct perf_sample *sample,
656 struct machine *machine __maybe_unused)
658 const char *comm = perf_evsel__strval(evsel, sample, "comm");
659 const u32 pid = perf_evsel__intval(evsel, sample, "pid");
660 struct task_desc *waker, *wakee;
663 printf("sched_wakeup event %p\n", evsel);
665 printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
668 waker = register_pid(sched, sample->tid, "<unknown>");
669 wakee = register_pid(sched, pid, comm);
671 add_sched_event_wakeup(sched, waker, sample->time, wakee);
675 static int replay_switch_event(struct perf_sched *sched,
676 struct perf_evsel *evsel,
677 struct perf_sample *sample,
678 struct machine *machine __maybe_unused)
680 const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"),
681 *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
682 const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
683 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
684 const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
685 struct task_desc *prev, __maybe_unused *next;
686 u64 timestamp0, timestamp = sample->time;
687 int cpu = sample->cpu;
691 printf("sched_switch event %p\n", evsel);
693 if (cpu >= MAX_CPUS || cpu < 0)
696 timestamp0 = sched->cpu_last_switched[cpu];
698 delta = timestamp - timestamp0;
703 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
707 pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
708 prev_comm, prev_pid, next_comm, next_pid, delta);
710 prev = register_pid(sched, prev_pid, prev_comm);
711 next = register_pid(sched, next_pid, next_comm);
713 sched->cpu_last_switched[cpu] = timestamp;
715 add_sched_event_run(sched, prev, timestamp, delta);
716 add_sched_event_sleep(sched, prev, timestamp, prev_state);
721 static int replay_fork_event(struct perf_sched *sched,
722 union perf_event *event,
723 struct machine *machine)
725 struct thread *child, *parent;
727 child = machine__findnew_thread(machine, event->fork.tid);
728 parent = machine__findnew_thread(machine, event->fork.ptid);
730 if (child == NULL || parent == NULL) {
731 pr_debug("thread does not exist on fork event: child %p, parent %p\n",
737 printf("fork event\n");
738 printf("... parent: %s/%d\n", parent->comm, parent->tid);
739 printf("... child: %s/%d\n", child->comm, child->tid);
742 register_pid(sched, parent->tid, parent->comm);
743 register_pid(sched, child->tid, child->comm);
747 struct sort_dimension {
750 struct list_head list;
754 thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
756 struct sort_dimension *sort;
759 BUG_ON(list_empty(list));
761 list_for_each_entry(sort, list, list) {
762 ret = sort->cmp(l, r);
770 static struct work_atoms *
771 thread_atoms_search(struct rb_root *root, struct thread *thread,
772 struct list_head *sort_list)
774 struct rb_node *node = root->rb_node;
775 struct work_atoms key = { .thread = thread };
778 struct work_atoms *atoms;
781 atoms = container_of(node, struct work_atoms, node);
783 cmp = thread_lat_cmp(sort_list, &key, atoms);
785 node = node->rb_left;
787 node = node->rb_right;
789 BUG_ON(thread != atoms->thread);
797 __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
798 struct list_head *sort_list)
800 struct rb_node **new = &(root->rb_node), *parent = NULL;
803 struct work_atoms *this;
806 this = container_of(*new, struct work_atoms, node);
809 cmp = thread_lat_cmp(sort_list, data, this);
812 new = &((*new)->rb_left);
814 new = &((*new)->rb_right);
817 rb_link_node(&data->node, parent, new);
818 rb_insert_color(&data->node, root);
821 static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
823 struct work_atoms *atoms = zalloc(sizeof(*atoms));
825 pr_err("No memory at %s\n", __func__);
829 atoms->thread = thread;
830 INIT_LIST_HEAD(&atoms->work_list);
831 __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
835 static char sched_out_state(u64 prev_state)
837 const char *str = TASK_STATE_TO_CHAR_STR;
839 return str[prev_state];
843 add_sched_out_event(struct work_atoms *atoms,
847 struct work_atom *atom = zalloc(sizeof(*atom));
849 pr_err("Non memory at %s", __func__);
853 atom->sched_out_time = timestamp;
855 if (run_state == 'R') {
856 atom->state = THREAD_WAIT_CPU;
857 atom->wake_up_time = atom->sched_out_time;
860 list_add_tail(&atom->list, &atoms->work_list);
865 add_runtime_event(struct work_atoms *atoms, u64 delta,
866 u64 timestamp __maybe_unused)
868 struct work_atom *atom;
870 BUG_ON(list_empty(&atoms->work_list));
872 atom = list_entry(atoms->work_list.prev, struct work_atom, list);
874 atom->runtime += delta;
875 atoms->total_runtime += delta;
879 add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
881 struct work_atom *atom;
884 if (list_empty(&atoms->work_list))
887 atom = list_entry(atoms->work_list.prev, struct work_atom, list);
889 if (atom->state != THREAD_WAIT_CPU)
892 if (timestamp < atom->wake_up_time) {
893 atom->state = THREAD_IGNORE;
897 atom->state = THREAD_SCHED_IN;
898 atom->sched_in_time = timestamp;
900 delta = atom->sched_in_time - atom->wake_up_time;
901 atoms->total_lat += delta;
902 if (delta > atoms->max_lat) {
903 atoms->max_lat = delta;
904 atoms->max_lat_at = timestamp;
909 static int latency_switch_event(struct perf_sched *sched,
910 struct perf_evsel *evsel,
911 struct perf_sample *sample,
912 struct machine *machine)
914 const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
915 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
916 const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
917 struct work_atoms *out_events, *in_events;
918 struct thread *sched_out, *sched_in;
919 u64 timestamp0, timestamp = sample->time;
920 int cpu = sample->cpu;
923 BUG_ON(cpu >= MAX_CPUS || cpu < 0);
925 timestamp0 = sched->cpu_last_switched[cpu];
926 sched->cpu_last_switched[cpu] = timestamp;
928 delta = timestamp - timestamp0;
933 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
937 sched_out = machine__findnew_thread(machine, prev_pid);
938 sched_in = machine__findnew_thread(machine, next_pid);
940 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
942 if (thread_atoms_insert(sched, sched_out))
944 out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
946 pr_err("out-event: Internal tree error");
950 if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
953 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
955 if (thread_atoms_insert(sched, sched_in))
957 in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
959 pr_err("in-event: Internal tree error");
963 * Task came in we have not heard about yet,
964 * add in an initial atom in runnable state:
966 if (add_sched_out_event(in_events, 'R', timestamp))
969 add_sched_in_event(in_events, timestamp);
974 static int latency_runtime_event(struct perf_sched *sched,
975 struct perf_evsel *evsel,
976 struct perf_sample *sample,
977 struct machine *machine)
979 const u32 pid = perf_evsel__intval(evsel, sample, "pid");
980 const u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
981 struct thread *thread = machine__findnew_thread(machine, pid);
982 struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
983 u64 timestamp = sample->time;
984 int cpu = sample->cpu;
986 BUG_ON(cpu >= MAX_CPUS || cpu < 0);
988 if (thread_atoms_insert(sched, thread))
990 atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
992 pr_err("in-event: Internal tree error");
995 if (add_sched_out_event(atoms, 'R', timestamp))
999 add_runtime_event(atoms, runtime, timestamp);
1003 static int latency_wakeup_event(struct perf_sched *sched,
1004 struct perf_evsel *evsel,
1005 struct perf_sample *sample,
1006 struct machine *machine)
1008 const u32 pid = perf_evsel__intval(evsel, sample, "pid"),
1009 success = perf_evsel__intval(evsel, sample, "success");
1010 struct work_atoms *atoms;
1011 struct work_atom *atom;
1012 struct thread *wakee;
1013 u64 timestamp = sample->time;
1015 /* Note for later, it may be interesting to observe the failing cases */
1019 wakee = machine__findnew_thread(machine, pid);
1020 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1022 if (thread_atoms_insert(sched, wakee))
1024 atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
1026 pr_err("wakeup-event: Internal tree error");
1029 if (add_sched_out_event(atoms, 'S', timestamp))
1033 BUG_ON(list_empty(&atoms->work_list));
1035 atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1038 * You WILL be missing events if you've recorded only
1039 * one CPU, or are only looking at only one, so don't
1040 * make useless noise.
1042 if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
1043 sched->nr_state_machine_bugs++;
1045 sched->nr_timestamps++;
1046 if (atom->sched_out_time > timestamp) {
1047 sched->nr_unordered_timestamps++;
1051 atom->state = THREAD_WAIT_CPU;
1052 atom->wake_up_time = timestamp;
1056 static int latency_migrate_task_event(struct perf_sched *sched,
1057 struct perf_evsel *evsel,
1058 struct perf_sample *sample,
1059 struct machine *machine)
1061 const u32 pid = perf_evsel__intval(evsel, sample, "pid");
1062 u64 timestamp = sample->time;
1063 struct work_atoms *atoms;
1064 struct work_atom *atom;
1065 struct thread *migrant;
1068 * Only need to worry about migration when profiling one CPU.
1070 if (sched->profile_cpu == -1)
1073 migrant = machine__findnew_thread(machine, pid);
1074 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1076 if (thread_atoms_insert(sched, migrant))
1078 register_pid(sched, migrant->tid, migrant->comm);
1079 atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
1081 pr_err("migration-event: Internal tree error");
1084 if (add_sched_out_event(atoms, 'R', timestamp))
1088 BUG_ON(list_empty(&atoms->work_list));
1090 atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1091 atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1093 sched->nr_timestamps++;
1095 if (atom->sched_out_time > timestamp)
1096 sched->nr_unordered_timestamps++;
1101 static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
1107 if (!work_list->nb_atoms)
1110 * Ignore idle threads:
1112 if (!strcmp(work_list->thread->comm, "swapper"))
1115 sched->all_runtime += work_list->total_runtime;
1116 sched->all_count += work_list->nb_atoms;
1118 ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->tid);
1120 for (i = 0; i < 24 - ret; i++)
1123 avg = work_list->total_lat / work_list->nb_atoms;
1125 printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
1126 (double)work_list->total_runtime / 1e6,
1127 work_list->nb_atoms, (double)avg / 1e6,
1128 (double)work_list->max_lat / 1e6,
1129 (double)work_list->max_lat_at / 1e9);
1132 static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1134 if (l->thread->tid < r->thread->tid)
1136 if (l->thread->tid > r->thread->tid)
1142 static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1152 avgl = l->total_lat / l->nb_atoms;
1153 avgr = r->total_lat / r->nb_atoms;
1163 static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1165 if (l->max_lat < r->max_lat)
1167 if (l->max_lat > r->max_lat)
1173 static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1175 if (l->nb_atoms < r->nb_atoms)
1177 if (l->nb_atoms > r->nb_atoms)
1183 static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1185 if (l->total_runtime < r->total_runtime)
1187 if (l->total_runtime > r->total_runtime)
1193 static int sort_dimension__add(const char *tok, struct list_head *list)
1196 static struct sort_dimension avg_sort_dimension = {
1200 static struct sort_dimension max_sort_dimension = {
1204 static struct sort_dimension pid_sort_dimension = {
1208 static struct sort_dimension runtime_sort_dimension = {
1212 static struct sort_dimension switch_sort_dimension = {
1216 struct sort_dimension *available_sorts[] = {
1217 &pid_sort_dimension,
1218 &avg_sort_dimension,
1219 &max_sort_dimension,
1220 &switch_sort_dimension,
1221 &runtime_sort_dimension,
1224 for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
1225 if (!strcmp(available_sorts[i]->name, tok)) {
1226 list_add_tail(&available_sorts[i]->list, list);
1235 static void perf_sched__sort_lat(struct perf_sched *sched)
1237 struct rb_node *node;
1240 struct work_atoms *data;
1241 node = rb_first(&sched->atom_root);
1245 rb_erase(node, &sched->atom_root);
1246 data = rb_entry(node, struct work_atoms, node);
1247 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1251 static int process_sched_wakeup_event(struct perf_tool *tool,
1252 struct perf_evsel *evsel,
1253 struct perf_sample *sample,
1254 struct machine *machine)
1256 struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1258 if (sched->tp_handler->wakeup_event)
1259 return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1264 static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
1265 struct perf_sample *sample, struct machine *machine)
1267 const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1268 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1269 struct thread *sched_out __maybe_unused, *sched_in;
1271 u64 timestamp0, timestamp = sample->time;
1273 int cpu, this_cpu = sample->cpu;
1275 BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1277 if (this_cpu > sched->max_cpu)
1278 sched->max_cpu = this_cpu;
1280 timestamp0 = sched->cpu_last_switched[this_cpu];
1281 sched->cpu_last_switched[this_cpu] = timestamp;
1283 delta = timestamp - timestamp0;
1288 pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1292 sched_out = machine__findnew_thread(machine, prev_pid);
1293 sched_in = machine__findnew_thread(machine, next_pid);
1295 sched->curr_thread[this_cpu] = sched_in;
1300 if (!sched_in->shortname[0]) {
1301 sched_in->shortname[0] = sched->next_shortname1;
1302 sched_in->shortname[1] = sched->next_shortname2;
1304 if (sched->next_shortname1 < 'Z') {
1305 sched->next_shortname1++;
1307 sched->next_shortname1='A';
1308 if (sched->next_shortname2 < '9') {
1309 sched->next_shortname2++;
1311 sched->next_shortname2='0';
1317 for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
1318 if (cpu != this_cpu)
1323 if (sched->curr_thread[cpu]) {
1324 if (sched->curr_thread[cpu]->tid)
1325 printf("%2s ", sched->curr_thread[cpu]->shortname);
1332 printf(" %12.6f secs ", (double)timestamp/1e9);
1333 if (new_shortname) {
1334 printf("%s => %s:%d\n",
1335 sched_in->shortname, sched_in->comm, sched_in->tid);
1343 static int process_sched_switch_event(struct perf_tool *tool,
1344 struct perf_evsel *evsel,
1345 struct perf_sample *sample,
1346 struct machine *machine)
1348 struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1349 int this_cpu = sample->cpu, err = 0;
1350 u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
1351 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
1353 if (sched->curr_pid[this_cpu] != (u32)-1) {
1355 * Are we trying to switch away a PID that is
1358 if (sched->curr_pid[this_cpu] != prev_pid)
1359 sched->nr_context_switch_bugs++;
1362 if (sched->tp_handler->switch_event)
1363 err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1365 sched->curr_pid[this_cpu] = next_pid;
1369 static int process_sched_runtime_event(struct perf_tool *tool,
1370 struct perf_evsel *evsel,
1371 struct perf_sample *sample,
1372 struct machine *machine)
1374 struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1376 if (sched->tp_handler->runtime_event)
1377 return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1382 static int perf_sched__process_fork_event(struct perf_tool *tool,
1383 union perf_event *event,
1384 struct perf_sample *sample,
1385 struct machine *machine)
1387 struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1389 /* run the fork event through the perf machinery */
1390 perf_event__process_fork(tool, event, sample, machine);
1392 /* and then run additional processing needed for this command */
1393 if (sched->tp_handler->fork_event)
1394 return sched->tp_handler->fork_event(sched, event, machine);
1399 static int process_sched_migrate_task_event(struct perf_tool *tool,
1400 struct perf_evsel *evsel,
1401 struct perf_sample *sample,
1402 struct machine *machine)
1404 struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
1406 if (sched->tp_handler->migrate_task_event)
1407 return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1412 typedef int (*tracepoint_handler)(struct perf_tool *tool,
1413 struct perf_evsel *evsel,
1414 struct perf_sample *sample,
1415 struct machine *machine);
1417 static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1418 union perf_event *event __maybe_unused,
1419 struct perf_sample *sample,
1420 struct perf_evsel *evsel,
1421 struct machine *machine)
1425 evsel->hists.stats.total_period += sample->period;
1426 hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE);
1428 if (evsel->handler.func != NULL) {
1429 tracepoint_handler f = evsel->handler.func;
1430 err = f(tool, evsel, sample, machine);
1436 static int perf_sched__read_events(struct perf_sched *sched,
1437 struct perf_session **psession)
1439 const struct perf_evsel_str_handler handlers[] = {
1440 { "sched:sched_switch", process_sched_switch_event, },
1441 { "sched:sched_stat_runtime", process_sched_runtime_event, },
1442 { "sched:sched_wakeup", process_sched_wakeup_event, },
1443 { "sched:sched_wakeup_new", process_sched_wakeup_event, },
1444 { "sched:sched_migrate_task", process_sched_migrate_task_event, },
1446 struct perf_session *session;
1448 session = perf_session__new(input_name, O_RDONLY, 0, false, &sched->tool);
1449 if (session == NULL) {
1450 pr_debug("No Memory for session\n");
1454 if (perf_session__set_tracepoints_handlers(session, handlers))
1457 if (perf_session__has_traces(session, "record -R")) {
1458 int err = perf_session__process_events(session, &sched->tool);
1460 pr_err("Failed to process events, error %d", err);
1464 sched->nr_events = session->stats.nr_events[0];
1465 sched->nr_lost_events = session->stats.total_lost;
1466 sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
1470 *psession = session;
1472 perf_session__delete(session);
1477 perf_session__delete(session);
1481 static void print_bad_events(struct perf_sched *sched)
1483 if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
1484 printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1485 (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
1486 sched->nr_unordered_timestamps, sched->nr_timestamps);
1488 if (sched->nr_lost_events && sched->nr_events) {
1489 printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1490 (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
1491 sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
1493 if (sched->nr_state_machine_bugs && sched->nr_timestamps) {
1494 printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)",
1495 (double)sched->nr_state_machine_bugs/(double)sched->nr_timestamps*100.0,
1496 sched->nr_state_machine_bugs, sched->nr_timestamps);
1497 if (sched->nr_lost_events)
1498 printf(" (due to lost events?)");
1501 if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
1502 printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
1503 (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
1504 sched->nr_context_switch_bugs, sched->nr_timestamps);
1505 if (sched->nr_lost_events)
1506 printf(" (due to lost events?)");
/*
 * Driver of the latency report: reads the trace, sorts the collected
 * per-thread work lists and prints one table row per entry followed by the
 * totals and the anomaly summary.
 *
 * NOTE(review): this listing omits some short lines (braces, returns, the
 * loop header and possibly further statements); the comments describe only
 * what is visible.
 */
1511 static int perf_sched__lat(struct perf_sched *sched)
1513 struct rb_node *next;
1514 struct perf_session *session;
1518 /* save session -- references to threads are held in work_list */
1519 if (perf_sched__read_events(sched, &session))
1522 perf_sched__sort_lat(sched);
/* Table header. */
1524 printf("\n ---------------------------------------------------------------------------------------------------------------\n");
1525 printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
1526 printf(" ---------------------------------------------------------------------------------------------------------------\n");
/* Walk the sorted rb-tree in order, printing one row per work_atoms node. */
1528 next = rb_first(&sched->sorted_atom_root);
1531 struct work_atoms *work_list;
1533 work_list = rb_entry(next, struct work_atoms, node);
1534 output_lat_thread(sched, work_list);
1535 next = rb_next(next);
/* Totals row: all_runtime is divided by 1e6 and labelled "ms". */
1538 printf(" -----------------------------------------------------------------------------------------\n");
1539 printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n",
1540 (double)sched->all_runtime / 1e6, sched->all_count);
1542 printf(" ---------------------------------------------------\n");
1544 print_bad_events(sched);
/* The session is released only after all output that uses the work lists. */
1547 perf_session__delete(session);
/*
 * Driver of the map view: the visible part records the number of configured
 * processors, processes the trace without keeping the session (NULL is
 * passed) and then prints the anomaly summary.
 *
 * NOTE(review): the sysconf() result is stored without a check in the
 * visible lines; sysconf() can return -1.  Short lines (braces, returns and
 * possibly other statements) are missing from this listing.
 */
1551 static int perf_sched__map(struct perf_sched *sched)
1553 sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1556 if (perf_sched__read_events(sched, NULL))
1558 print_bad_events(sched);
1562 static int perf_sched__replay(struct perf_sched *sched)
1566 calibrate_run_measurement_overhead(sched);
1567 calibrate_sleep_measurement_overhead(sched);
1569 test_calibrations(sched);
1571 if (perf_sched__read_events(sched, NULL))
1574 printf("nr_run_events: %ld\n", sched->nr_run_events);
1575 printf("nr_sleep_events: %ld\n", sched->nr_sleep_events);
1576 printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events);
1578 if (sched->targetless_wakeups)
1579 printf("target-less wakeups: %ld\n", sched->targetless_wakeups);
1580 if (sched->multitarget_wakeups)
1581 printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
1582 if (sched->nr_run_events_optimized)
1583 printf("run atoms optimized: %ld\n",
1584 sched->nr_run_events_optimized);
1586 print_task_traces(sched);
1587 add_cross_task_wakeups(sched);
1589 create_tasks(sched);
1590 printf("------------------------------------------------------------\n");
1591 for (i = 0; i < sched->replay_repeat; i++)
1592 run_one_test(sched);
1597 static void setup_sorting(struct perf_sched *sched, const struct option *options,
1598 const char * const usage_msg[])
1600 char *tmp, *tok, *str = strdup(sched->sort_order);
1602 for (tok = strtok_r(str, ", ", &tmp);
1603 tok; tok = strtok_r(NULL, ", ", &tmp)) {
1604 if (sort_dimension__add(tok, &sched->sort_list) < 0) {
1605 error("Unknown --sort key: `%s'", tok);
1606 usage_with_options(usage_msg, options);
1612 sort_dimension__add("pid", &sched->cmp_pid);
/*
 * Builds the argument vector for cmd_record(): the fixed record_args come
 * first, followed by the caller's arguments argv[1..argc-1], and the result
 * of cmd_record() is returned.
 *
 * NOTE(review): this listing omits short lines, including the first entries
 * of record_args and the value returned when the allocation fails.
 */
1615 static int __cmd_record(int argc, const char **argv)
1617 unsigned int rec_argc, i, j;
1618 const char **rec_argv;
1619 const char * const record_args[] = {
1625 "-e", "sched:sched_switch",
1626 "-e", "sched:sched_stat_wait",
1627 "-e", "sched:sched_stat_sleep",
1628 "-e", "sched:sched_stat_iowait",
1629 "-e", "sched:sched_stat_runtime",
1630 "-e", "sched:sched_process_fork",
1631 "-e", "sched:sched_wakeup",
1632 "-e", "sched:sched_migrate_task",
/* argv[0] is not forwarded, hence the "- 1". */
1635 rec_argc = ARRAY_SIZE(record_args) + argc - 1;
/* One extra zeroed slot is allocated beyond rec_argc. */
1636 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1638 if (rec_argv == NULL)
/* NOTE(review): the strdup() results are stored without a NULL check. */
1641 for (i = 0; i < ARRAY_SIZE(record_args); i++)
1642 rec_argv[i] = strdup(record_args[i]);
/* The caller's arguments are referenced, not copied; i keeps counting up. */
1644 for (j = 1; j < (unsigned int)argc; j++, i++)
1645 rec_argv[i] = argv[j];
/* Sanity check: every slot counted in rec_argc must have been filled. */
1647 BUG_ON(i != rec_argc);
/* rec_argv is not freed in the visible lines. */
1649 return cmd_record(i, rec_argv, NULL);
/* Sort keys used when no --sort option overrides sched.sort_order. */
1652 static const char default_sort_order[] = "avg, max, switch, runtime";
/*
 * File-scope tool state with its default settings.
 *
 * NOTE(review): this listing omits short lines here (the opener and closer
 * of the nested callback initializer and at least one member initializer).
 */
1653 static struct perf_sched sched = {
/* Event callbacks. */
1655 .sample = perf_sched__process_tracepoint_sample,
1656 .comm = perf_event__process_comm,
1657 .lost = perf_event__process_lost,
1658 .fork = perf_sched__process_fork_event,
1659 .ordered_samples = true,
/* Both lists start out empty (self-referencing list heads). */
1661 .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid),
1662 .sort_list = LIST_HEAD_INIT(sched.sort_list),
1663 .start_work_mutex = PTHREAD_MUTEX_INITIALIZER,
1664 .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
/* Every one of the MAX_CPUS slots starts at -1 (GNU range designator). */
1665 .curr_pid = { [0 ... MAX_CPUS - 1] = -1 },
1666 .sort_order = default_sort_order,
/* Default for the replay -r/--repeat option. */
1667 .replay_repeat = 10,
1669 .next_shortname1 = 'A',
1670 .next_shortname2 = '0',
/*
 * Entry point of 'perf sched': parses the top-level options, stopping at the
 * first non-option argument, and dispatches on the sub-command in argv[0]
 * (record, latency, map, replay or script).
 *
 * NOTE(review): this listing omits short lines (array terminators, braces,
 * one option help string and several conditions); the comments describe
 * only what is visible.
 */
1673 int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
/* Options of the latency sub-command. */
1675 const struct option latency_options[] = {
1676 OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
1677 "sort by key(s): runtime, switch, avg, max"),
1678 OPT_INCR('v', "verbose", &verbose,
1679 "be more verbose (show symbol address, etc)"),
1680 OPT_INTEGER('C', "CPU", &sched.profile_cpu,
1681 "CPU to profile on"),
1682 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1683 "dump raw trace in ASCII"),
/* Options of the replay sub-command. */
1686 const struct option replay_options[] = {
1687 OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
1688 "repeat the workload replay N times (-1: infinite)"),
1689 OPT_INCR('v', "verbose", &verbose,
1690 "be more verbose (show symbol address, etc)"),
1691 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1692 "dump raw trace in ASCII"),
/* Options accepted before the sub-command name. */
1695 const struct option sched_options[] = {
1696 OPT_STRING('i', "input", &input_name, "file",
1698 OPT_INCR('v', "verbose", &verbose,
1699 "be more verbose (show symbol address, etc)"),
1700 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1701 "dump raw trace in ASCII"),
/* Usage lines for the latency, replay and top-level commands. */
1704 const char * const latency_usage[] = {
1705 "perf sched latency [<options>]",
1708 const char * const replay_usage[] = {
1709 "perf sched replay [<options>]",
1712 const char * const sched_usage[] = {
1713 "perf sched [<options>] {record|latency|map|replay|script}",
/*
 * Handler tables, one per analysis mode.  They are locals of this function
 * and their addresses are stored in the file-scope sched.tp_handler below.
 */
1716 struct trace_sched_handler lat_ops = {
1717 .wakeup_event = latency_wakeup_event,
1718 .switch_event = latency_switch_event,
1719 .runtime_event = latency_runtime_event,
1720 .migrate_task_event = latency_migrate_task_event,
1722 struct trace_sched_handler map_ops = {
1723 .switch_event = map_switch_event,
1725 struct trace_sched_handler replay_ops = {
1726 .wakeup_event = replay_wakeup_event,
1727 .switch_event = replay_switch_event,
1728 .fork_event = replay_fork_event,
/* Top-level parse stops at the first non-option, i.e. the sub-command name. */
1731 argc = parse_options(argc, argv, sched_options, sched_usage,
1732 PARSE_OPT_STOP_AT_NON_OPTION);
1734 usage_with_options(sched_usage, sched_options);
1737 * Aliased to 'perf script' for now:
1739 if (!strcmp(argv[0], "script"))
1740 return cmd_script(argc, argv, prefix);
/*
 * "rec", "lat" and "rep" are matched on their first three characters only,
 * so longer spellings are accepted; "map" and "script" must match exactly.
 */
1743 if (!strncmp(argv[0], "rec", 3)) {
1744 return __cmd_record(argc, argv);
1745 } else if (!strncmp(argv[0], "lat", 3)) {
1746 sched.tp_handler = &lat_ops;
1748 argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1750 usage_with_options(latency_usage, latency_options);
1752 setup_sorting(&sched, latency_options, latency_usage);
1753 return perf_sched__lat(&sched);
1754 } else if (!strcmp(argv[0], "map")) {
1755 sched.tp_handler = &map_ops;
/*
 * NOTE(review): the map branch hands the latency option table and usage
 * text to setup_sorting(), and no parse_options() call is visible for it.
 */
1756 setup_sorting(&sched, latency_options, latency_usage);
1757 return perf_sched__map(&sched);
1758 } else if (!strncmp(argv[0], "rep", 3)) {
1759 sched.tp_handler = &replay_ops;
1761 argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1763 usage_with_options(replay_usage, replay_options);
1765 return perf_sched__replay(&sched);
/* Reached when argv[0] matches none of the sub-commands above. */
1767 usage_with_options(sched_usage, sched_options);