// SPDX-License-Identifier: GPL-2.0
#include "builtin.h"
#include "perf.h"

#include "util/dso.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/config.h"
#include "util/map.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/callchain.h"
#include "util/time-utils.h"
#include <linux/err.h>

#include <subcmd/pager.h>
#include <subcmd/parse-options.h>
#include "util/trace-event.h"
#include "util/data.h"
#include "util/cpumap.h"

#include "util/debug.h"
#include "util/string2.h"

#include <linux/kernel.h>
#include <linux/numa.h>
#include <linux/rbtree.h>
#include <linux/string.h>
#include <linux/zalloc.h>
#include <errno.h>
#include <inttypes.h>
#include <locale.h>
#include <regex.h>

#include <linux/ctype.h>
#include <traceevent/event-parse.h>

static int      kmem_slab;
static int      kmem_page;

static long     kmem_page_size;
static enum {
        KMEM_SLAB,
        KMEM_PAGE,
} kmem_default = KMEM_SLAB;  /* for backward compatibility */

struct alloc_stat;
typedef int (*sort_fn_t)(void *, void *);

static int                      alloc_flag;
static int                      caller_flag;

static int                      alloc_lines = -1;
static int                      caller_lines = -1;

static bool                     raw_ip;

struct alloc_stat {
        u64     call_site;
        u64     ptr;
        u64     bytes_req;
        u64     bytes_alloc;
        u64     last_alloc;
        u32     hit;
        u32     pingpong;

        short   alloc_cpu;

        struct rb_node node;
};

static struct rb_root root_alloc_stat;
static struct rb_root root_alloc_sorted;
static struct rb_root root_caller_stat;
static struct rb_root root_caller_sorted;

static unsigned long total_requested, total_allocated, total_freed;
static unsigned long nr_allocs, nr_cross_allocs;
/* filters controlling the time range of the analysis */
static struct perf_time_interval ptime;
const char *time_str;

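/*
 * Slab allocation events are aggregated in two rbtrees of struct
 * alloc_stat: root_alloc_stat is keyed by the returned pointer (so a
 * later free can find its allocation), root_caller_stat is keyed by
 * call site.  Both are flattened into the *_sorted trees before
 * printing.
 */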
static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
                             int bytes_req, int bytes_alloc, int cpu)
{
        struct rb_node **node = &root_alloc_stat.rb_node;
        struct rb_node *parent = NULL;
        struct alloc_stat *data = NULL;

        while (*node) {
                parent = *node;
                data = rb_entry(*node, struct alloc_stat, node);

                if (ptr > data->ptr)
                        node = &(*node)->rb_right;
                else if (ptr < data->ptr)
                        node = &(*node)->rb_left;
                else
                        break;
        }

        if (data && data->ptr == ptr) {
                data->hit++;
                data->bytes_req += bytes_req;
                data->bytes_alloc += bytes_alloc;
        } else {
                data = malloc(sizeof(*data));
                if (!data) {
                        pr_err("%s: malloc failed\n", __func__);
                        return -1;
                }
                data->ptr = ptr;
                data->pingpong = 0;
                data->hit = 1;
                data->bytes_req = bytes_req;
                data->bytes_alloc = bytes_alloc;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &root_alloc_stat);
        }
        data->call_site = call_site;
        data->alloc_cpu = cpu;
        data->last_alloc = bytes_alloc;

        return 0;
}

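/*
 * Like insert_alloc_stat(), but keyed by call site: one node per
 * allocation site, accumulating the hit count and the requested and
 * allocated byte totals.
 */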
static int insert_caller_stat(unsigned long call_site,
                              int bytes_req, int bytes_alloc)
{
        struct rb_node **node = &root_caller_stat.rb_node;
        struct rb_node *parent = NULL;
        struct alloc_stat *data = NULL;

        while (*node) {
                parent = *node;
                data = rb_entry(*node, struct alloc_stat, node);

                if (call_site > data->call_site)
                        node = &(*node)->rb_right;
                else if (call_site < data->call_site)
                        node = &(*node)->rb_left;
                else
                        break;
        }

        if (data && data->call_site == call_site) {
                data->hit++;
                data->bytes_req += bytes_req;
                data->bytes_alloc += bytes_alloc;
        } else {
                data = malloc(sizeof(*data));
                if (!data) {
                        pr_err("%s: malloc failed\n", __func__);
                        return -1;
                }
                data->call_site = call_site;
                data->pingpong = 0;
                data->hit = 1;
                data->bytes_req = bytes_req;
                data->bytes_alloc = bytes_alloc;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &root_caller_stat);
        }

        return 0;
}

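/*
 * Handler for the kmem:kmalloc and kmem:kmem_cache_alloc tracepoints
 * (and their legacy *_node variants).  Each sample carries the fields
 * "ptr", "call_site", "bytes_req" and "bytes_alloc", and is folded
 * into both the per-pointer and the per-callsite stats.
 */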
static int evsel__process_alloc_event(struct evsel *evsel, struct perf_sample *sample)
{
        unsigned long ptr = evsel__intval(evsel, sample, "ptr"),
                      call_site = evsel__intval(evsel, sample, "call_site");
        int bytes_req = evsel__intval(evsel, sample, "bytes_req"),
            bytes_alloc = evsel__intval(evsel, sample, "bytes_alloc");

        if (insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, sample->cpu) ||
            insert_caller_stat(call_site, bytes_req, bytes_alloc))
                return -1;

        total_requested += bytes_req;
        total_allocated += bytes_alloc;

        nr_allocs++;

        /*
         * Commit 11e9734bcb6a ("mm/slab_common: unify NUMA and UMA
         * version of tracepoints") added the field "node" to the
         * tracepoints 'kmalloc' and 'kmem_cache_alloc'.
         *
         * The legacy tracepoints 'kmalloc_node' and 'kmem_cache_alloc_node'
         * also contain the field "node".
         *
         * If the tracepoint contains the field "node", the tool counts
         * cross-node allocations.
         */
        if (evsel__field(evsel, "node")) {
                int node1, node2;

                node1 = cpu__get_node((struct perf_cpu){.cpu = sample->cpu});
                node2 = evsel__intval(evsel, sample, "node");

                /*
                 * If the field "node" is NUMA_NO_NODE (-1), we don't take it
                 * as a cross allocation.
                 */
                if ((node2 != NUMA_NO_NODE) && (node1 != node2))
                        nr_cross_allocs++;
        }

        return 0;
}

static int ptr_cmp(void *, void *);
static int slab_callsite_cmp(void *, void *);

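/*
 * Look up an existing alloc_stat node, e.g.:
 *
 *   s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
 *
 * The key is a stack-allocated alloc_stat with only the relevant field
 * set; sort_fn selects lookup by pointer (ptr_cmp) or by call site
 * (slab_callsite_cmp).
 */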
static struct alloc_stat *search_alloc_stat(unsigned long ptr,
                                            unsigned long call_site,
                                            struct rb_root *root,
                                            sort_fn_t sort_fn)
{
        struct rb_node *node = root->rb_node;
        struct alloc_stat key = { .ptr = ptr, .call_site = call_site };

        while (node) {
                struct alloc_stat *data;
                int cmp;

                data = rb_entry(node, struct alloc_stat, node);

                cmp = sort_fn(&key, data);
                if (cmp < 0)
                        node = node->rb_left;
                else if (cmp > 0)
                        node = node->rb_right;
                else
                        return data;
        }
        return NULL;
}

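/*
 * Handler for kmem:kfree and kmem:kmem_cache_free.  A free issued on a
 * CPU other than the allocating one is counted as a "pingpong" against
 * both the allocation and its call site.
 */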
static int evsel__process_free_event(struct evsel *evsel, struct perf_sample *sample)
{
        unsigned long ptr = evsel__intval(evsel, sample, "ptr");
        struct alloc_stat *s_alloc, *s_caller;

        s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
        if (!s_alloc)
                return 0;

        total_freed += s_alloc->last_alloc;

        if ((short)sample->cpu != s_alloc->alloc_cpu) {
                s_alloc->pingpong++;

                s_caller = search_alloc_stat(0, s_alloc->call_site,
                                             &root_caller_stat,
                                             slab_callsite_cmp);
                if (!s_caller)
                        return -1;
                s_caller->pingpong++;
        }
        s_alloc->alloc_cpu = -1;

        return 0;
}

static u64 total_page_alloc_bytes;
static u64 total_page_free_bytes;
static u64 total_page_nomatch_bytes;
static u64 total_page_fail_bytes;
static unsigned long nr_page_allocs;
static unsigned long nr_page_frees;
static unsigned long nr_page_fails;
static unsigned long nr_page_nomatch;

static bool use_pfn;
static bool live_page;
static struct perf_session *kmem_session;

#define MAX_MIGRATE_TYPES  6
#define MAX_PAGE_ORDER     11

static int order_stats[MAX_PAGE_ORDER][MAX_MIGRATE_TYPES];

struct page_stat {
        struct rb_node  node;
        u64             page;
        u64             callsite;
        int             order;
        unsigned        gfp_flags;
        unsigned        migrate_type;
        u64             alloc_bytes;
        u64             free_bytes;
        int             nr_alloc;
        int             nr_free;
};

static struct rb_root page_live_tree;
static struct rb_root page_alloc_tree;
static struct rb_root page_alloc_sorted;
static struct rb_root page_caller_tree;
static struct rb_root page_caller_sorted;

struct alloc_func {
        u64 start;
        u64 end;
        char *name;
};

static int nr_alloc_funcs;
static struct alloc_func *alloc_func_list;

static int funcmp(const void *a, const void *b)
{
        const struct alloc_func *fa = a;
        const struct alloc_func *fb = b;

        if (fa->start > fb->start)
                return 1;
        else
                return -1;
}

static int callcmp(const void *a, const void *b)
{
        const struct alloc_func *fa = a;
        const struct alloc_func *fb = b;

        if (fb->start <= fa->start && fa->end < fb->end)
                return 0;

        if (fa->start > fb->start)
                return 1;
        else
                return -1;
}

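/*
 * Collect all kernel symbols that look like page allocation entry
 * points (e.g. __alloc_pages, __get_free_pages, get_zeroed_page) into
 * a sorted array, so that find_callsite() can skip over them when it
 * walks the callchain.
 */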
static int build_alloc_func_list(void)
{
        int ret;
        struct map *kernel_map;
        struct symbol *sym;
        struct rb_node *node;
        struct alloc_func *func;
        struct machine *machine = &kmem_session->machines.host;
        regex_t alloc_func_regex;
        static const char pattern[] = "^_?_?(alloc|get_free|get_zeroed)_pages?";

        ret = regcomp(&alloc_func_regex, pattern, REG_EXTENDED);
        if (ret) {
                char err[BUFSIZ];

                regerror(ret, &alloc_func_regex, err, sizeof(err));
                pr_err("Invalid regex: %s\n%s", pattern, err);
                return -EINVAL;
        }

        kernel_map = machine__kernel_map(machine);
        if (map__load(kernel_map) < 0) {
                pr_err("cannot load kernel map\n");
                return -ENOENT;
        }

        map__for_each_symbol(kernel_map, sym, node) {
                if (regexec(&alloc_func_regex, sym->name, 0, NULL, 0))
                        continue;

                func = realloc(alloc_func_list,
                               (nr_alloc_funcs + 1) * sizeof(*func));
                if (func == NULL)
                        return -ENOMEM;

                pr_debug("alloc func: %s\n", sym->name);
                func[nr_alloc_funcs].start = sym->start;
                func[nr_alloc_funcs].end   = sym->end;
                func[nr_alloc_funcs].name  = sym->name;

                alloc_func_list = func;
                nr_alloc_funcs++;
        }

        qsort(alloc_func_list, nr_alloc_funcs, sizeof(*func), funcmp);

        regfree(&alloc_func_regex);
        return 0;
}

/*
 * Find the first non-allocation function in the callchain; the
 * allocation functions themselves are listed in 'alloc_func_list'.
 */
static u64 find_callsite(struct evsel *evsel, struct perf_sample *sample)
{
        struct addr_location al;
        struct machine *machine = &kmem_session->machines.host;
        struct callchain_cursor_node *node;

        if (alloc_func_list == NULL) {
                if (build_alloc_func_list() < 0)
                        goto out;
        }

        al.thread = machine__findnew_thread(machine, sample->pid, sample->tid);
        sample__resolve_callchain(sample, &callchain_cursor, NULL, evsel, &al, 16);

        callchain_cursor_commit(&callchain_cursor);
        while (true) {
                struct alloc_func key, *caller;
                u64 addr;

                node = callchain_cursor_current(&callchain_cursor);
                if (node == NULL)
                        break;

                key.start = key.end = node->ip;
                caller = bsearch(&key, alloc_func_list, nr_alloc_funcs,
                                 sizeof(key), callcmp);
                if (!caller) {
                        /* found */
                        if (node->ms.map)
                                addr = map__unmap_ip(node->ms.map, node->ip);
                        else
                                addr = node->ip;

                        return addr;
                } else
                        pr_debug3("skipping alloc function: %s\n", caller->name);

                callchain_cursor_advance(&callchain_cursor);
        }

out:
        pr_debug2("unknown callsite: %"PRIx64 "\n", sample->ip);
        return sample->ip;
}

struct sort_dimension {
        const char              name[20];
        sort_fn_t               cmp;
        struct list_head        list;
};

static LIST_HEAD(page_alloc_sort_input);
static LIST_HEAD(page_caller_sort_input);

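/*
 * Page events are tracked in three rbtrees of struct page_stat:
 * page_live_tree is keyed by page/pfn, while page_alloc_tree and
 * page_caller_tree are keyed by the user-selected sort dimensions.
 * Each tree has a find-or-create helper below; 'create' selects
 * between lookup-only and insert-on-miss behaviour.
 */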
static struct page_stat *
__page_stat__findnew_page(struct page_stat *pstat, bool create)
{
        struct rb_node **node = &page_live_tree.rb_node;
        struct rb_node *parent = NULL;
        struct page_stat *data;

        while (*node) {
                s64 cmp;

                parent = *node;
                data = rb_entry(*node, struct page_stat, node);

                cmp = data->page - pstat->page;
                if (cmp < 0)
                        node = &parent->rb_left;
                else if (cmp > 0)
                        node = &parent->rb_right;
                else
                        return data;
        }

        if (!create)
                return NULL;

        data = zalloc(sizeof(*data));
        if (data != NULL) {
                data->page = pstat->page;
                data->order = pstat->order;
                data->gfp_flags = pstat->gfp_flags;
                data->migrate_type = pstat->migrate_type;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &page_live_tree);
        }

        return data;
}

static struct page_stat *page_stat__find_page(struct page_stat *pstat)
{
        return __page_stat__findnew_page(pstat, false);
}

static struct page_stat *page_stat__findnew_page(struct page_stat *pstat)
{
        return __page_stat__findnew_page(pstat, true);
}

static struct page_stat *
__page_stat__findnew_alloc(struct page_stat *pstat, bool create)
{
        struct rb_node **node = &page_alloc_tree.rb_node;
        struct rb_node *parent = NULL;
        struct page_stat *data;
        struct sort_dimension *sort;

        while (*node) {
                int cmp = 0;

                parent = *node;
                data = rb_entry(*node, struct page_stat, node);

                list_for_each_entry(sort, &page_alloc_sort_input, list) {
                        cmp = sort->cmp(pstat, data);
                        if (cmp)
                                break;
                }

                if (cmp < 0)
                        node = &parent->rb_left;
                else if (cmp > 0)
                        node = &parent->rb_right;
                else
                        return data;
        }

        if (!create)
                return NULL;

        data = zalloc(sizeof(*data));
        if (data != NULL) {
                data->page = pstat->page;
                data->order = pstat->order;
                data->gfp_flags = pstat->gfp_flags;
                data->migrate_type = pstat->migrate_type;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &page_alloc_tree);
        }

        return data;
}

static struct page_stat *page_stat__find_alloc(struct page_stat *pstat)
{
        return __page_stat__findnew_alloc(pstat, false);
}

static struct page_stat *page_stat__findnew_alloc(struct page_stat *pstat)
{
        return __page_stat__findnew_alloc(pstat, true);
}

static struct page_stat *
__page_stat__findnew_caller(struct page_stat *pstat, bool create)
{
        struct rb_node **node = &page_caller_tree.rb_node;
        struct rb_node *parent = NULL;
        struct page_stat *data;
        struct sort_dimension *sort;

        while (*node) {
                int cmp = 0;

                parent = *node;
                data = rb_entry(*node, struct page_stat, node);

                list_for_each_entry(sort, &page_caller_sort_input, list) {
                        cmp = sort->cmp(pstat, data);
                        if (cmp)
                                break;
                }

                if (cmp < 0)
                        node = &parent->rb_left;
                else if (cmp > 0)
                        node = &parent->rb_right;
                else
                        return data;
        }

        if (!create)
                return NULL;

        data = zalloc(sizeof(*data));
        if (data != NULL) {
                data->callsite = pstat->callsite;
                data->order = pstat->order;
                data->gfp_flags = pstat->gfp_flags;
                data->migrate_type = pstat->migrate_type;

                rb_link_node(&data->node, parent, node);
                rb_insert_color(&data->node, &page_caller_tree);
        }

        return data;
}

static struct page_stat *page_stat__find_caller(struct page_stat *pstat)
{
        return __page_stat__findnew_caller(pstat, false);
}

static struct page_stat *page_stat__findnew_caller(struct page_stat *pstat)
{
        return __page_stat__findnew_caller(pstat, true);
}

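/*
 * Depending on the kernel, the mm_page_alloc tracepoint exposes either
 * a raw pfn (invalid value: -1) or a struct page pointer (invalid
 * value: NULL); use_pfn records which variant this perf.data file
 * contains.
 */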
static bool valid_page(u64 pfn_or_page)
{
        if (use_pfn && pfn_or_page == -1UL)
                return false;
        if (!use_pfn && pfn_or_page == 0)
                return false;
        return true;
}

struct gfp_flag {
        unsigned int flags;
        char *compact_str;
        char *human_readable;
};

static struct gfp_flag *gfps;
static int nr_gfps;

static int gfpcmp(const void *a, const void *b)
{
        const struct gfp_flag *fa = a;
        const struct gfp_flag *fb = b;

        return fa->flags - fb->flags;
}

/* see include/trace/events/mmflags.h */
static const struct {
        const char *original;
        const char *compact;
} gfp_compact_table[] = {
        { "GFP_TRANSHUGE",              "THP" },
        { "GFP_TRANSHUGE_LIGHT",        "THL" },
        { "GFP_HIGHUSER_MOVABLE",       "HUM" },
        { "GFP_HIGHUSER",               "HU" },
        { "GFP_USER",                   "U" },
        { "GFP_KERNEL_ACCOUNT",         "KAC" },
        { "GFP_KERNEL",                 "K" },
        { "GFP_NOFS",                   "NF" },
        { "GFP_ATOMIC",                 "A" },
        { "GFP_NOIO",                   "NI" },
        { "GFP_NOWAIT",                 "NW" },
        { "GFP_DMA",                    "D" },
        { "__GFP_HIGHMEM",              "HM" },
        { "GFP_DMA32",                  "D32" },
        { "__GFP_HIGH",                 "H" },
        { "__GFP_ATOMIC",               "_A" },
        { "__GFP_IO",                   "I" },
        { "__GFP_FS",                   "F" },
        { "__GFP_NOWARN",               "NWR" },
        { "__GFP_RETRY_MAYFAIL",        "R" },
        { "__GFP_NOFAIL",               "NF" },
        { "__GFP_NORETRY",              "NR" },
        { "__GFP_COMP",                 "C" },
        { "__GFP_ZERO",                 "Z" },
        { "__GFP_NOMEMALLOC",           "NMA" },
        { "__GFP_MEMALLOC",             "MA" },
        { "__GFP_HARDWALL",             "HW" },
        { "__GFP_THISNODE",             "TN" },
        { "__GFP_RECLAIMABLE",          "RC" },
        { "__GFP_MOVABLE",              "M" },
        { "__GFP_ACCOUNT",              "AC" },
        { "__GFP_WRITE",                "WR" },
        { "__GFP_RECLAIM",              "R" },
        { "__GFP_DIRECT_RECLAIM",       "DR" },
        { "__GFP_KSWAPD_RECLAIM",       "KR" },
};

static size_t max_gfp_len;

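/*
 * Turn a printed gfp string such as "GFP_KERNEL|__GFP_ZERO" into its
 * compact form "K|Z" using gfp_compact_table, so that the flags fit
 * into the output columns.  Returns a newly allocated string, or NULL
 * on allocation failure.
 */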
static char *compact_gfp_flags(char *gfp_flags)
{
        char *orig_flags = strdup(gfp_flags);
        char *new_flags = NULL;
        char *str, *pos = NULL;
        size_t len = 0;

        if (orig_flags == NULL)
                return NULL;

        str = strtok_r(orig_flags, "|", &pos);
        while (str) {
                size_t i;
                char *new;
                const char *cpt;

                for (i = 0; i < ARRAY_SIZE(gfp_compact_table); i++) {
                        if (strcmp(gfp_compact_table[i].original, str))
                                continue;

                        cpt = gfp_compact_table[i].compact;
                        new = realloc(new_flags, len + strlen(cpt) + 2);
                        if (new == NULL) {
                                free(new_flags);
                                free(orig_flags);
                                return NULL;
                        }

                        new_flags = new;

                        if (!len) {
                                strcpy(new_flags, cpt);
                        } else {
                                strcat(new_flags, "|");
                                strcat(new_flags, cpt);
                                len++;
                        }

                        len += strlen(cpt);
                }

                str = strtok_r(NULL, "|", &pos);
        }

        if (max_gfp_len < len)
                max_gfp_len = len;

        free(orig_flags);
        return new_flags;
}

static char *compact_gfp_string(unsigned long gfp_flags)
{
        struct gfp_flag key = {
                .flags = gfp_flags,
        };
        struct gfp_flag *gfp;

        gfp = bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp);
        if (gfp)
                return gfp->compact_str;

        return NULL;
}

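/*
 * Cache the textual form of this sample's gfp_flags value.  The raw
 * event is printed through libtraceevent only on the first occurrence
 * of each distinct value; the "gfp_flags=..." token is then parsed out
 * and stored, both human readable and compacted, in the sorted 'gfps'
 * array.
 */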
static int parse_gfp_flags(struct evsel *evsel, struct perf_sample *sample,
                           unsigned int gfp_flags)
{
        struct tep_record record = {
                .cpu = sample->cpu,
                .data = sample->raw_data,
                .size = sample->raw_size,
        };
        struct trace_seq seq;
        char *str, *pos = NULL;

        if (nr_gfps) {
                struct gfp_flag key = {
                        .flags = gfp_flags,
                };

                if (bsearch(&key, gfps, nr_gfps, sizeof(*gfps), gfpcmp))
                        return 0;
        }

        trace_seq_init(&seq);
        tep_print_event(evsel->tp_format->tep,
                        &seq, &record, "%s", TEP_PRINT_INFO);

        str = strtok_r(seq.buffer, " ", &pos);
        while (str) {
                if (!strncmp(str, "gfp_flags=", 10)) {
                        struct gfp_flag *new;

                        new = realloc(gfps, (nr_gfps + 1) * sizeof(*gfps));
                        if (new == NULL)
                                return -ENOMEM;

                        gfps = new;
                        new += nr_gfps++;

                        new->flags = gfp_flags;
                        new->human_readable = strdup(str + 10);
                        new->compact_str = compact_gfp_flags(str + 10);
                        if (!new->human_readable || !new->compact_str)
                                return -ENOMEM;

                        qsort(gfps, nr_gfps, sizeof(*gfps), gfpcmp);
                }

                str = strtok_r(NULL, " ", &pos);
        }

        trace_seq_destroy(&seq);
        return 0;
}

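/*
 * Handler for kmem:mm_page_alloc.  Updates the global counters, the
 * live-page tree, the cumulative alloc tree (unless --live is used)
 * and the per-callsite tree, plus the order/migrate-type matrix shown
 * in the summary.
 */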
static int evsel__process_page_alloc_event(struct evsel *evsel, struct perf_sample *sample)
{
        u64 page;
        unsigned int order = evsel__intval(evsel, sample, "order");
        unsigned int gfp_flags = evsel__intval(evsel, sample, "gfp_flags");
        unsigned int migrate_type = evsel__intval(evsel, sample,
                                                       "migratetype");
        u64 bytes = kmem_page_size << order;
        u64 callsite;
        struct page_stat *pstat;
        struct page_stat this = {
                .order = order,
                .gfp_flags = gfp_flags,
                .migrate_type = migrate_type,
        };

        if (use_pfn)
                page = evsel__intval(evsel, sample, "pfn");
        else
                page = evsel__intval(evsel, sample, "page");

        nr_page_allocs++;
        total_page_alloc_bytes += bytes;

        if (!valid_page(page)) {
                nr_page_fails++;
                total_page_fail_bytes += bytes;

                return 0;
        }

        if (parse_gfp_flags(evsel, sample, gfp_flags) < 0)
                return -1;

        callsite = find_callsite(evsel, sample);

        /*
         * Track the live page so that the free event can recover its
         * gfp flags and migrate type.
         */
        this.page = page;
        pstat = page_stat__findnew_page(&this);
        if (pstat == NULL)
                return -ENOMEM;

        pstat->nr_alloc++;
        pstat->alloc_bytes += bytes;
        pstat->callsite = callsite;

        if (!live_page) {
                pstat = page_stat__findnew_alloc(&this);
                if (pstat == NULL)
                        return -ENOMEM;

                pstat->nr_alloc++;
                pstat->alloc_bytes += bytes;
                pstat->callsite = callsite;
        }

        this.callsite = callsite;
        pstat = page_stat__findnew_caller(&this);
        if (pstat == NULL)
                return -ENOMEM;

        pstat->nr_alloc++;
        pstat->alloc_bytes += bytes;

        order_stats[order][migrate_type]++;

        return 0;
}

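/*
 * Handler for kmem:mm_page_free.  The freed page is looked up in the
 * live tree to recover its gfp flags, migrate type and call site; the
 * free is then charged back to the matching alloc and caller entries.
 */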
static int evsel__process_page_free_event(struct evsel *evsel, struct perf_sample *sample)
{
        u64 page;
        unsigned int order = evsel__intval(evsel, sample, "order");
        u64 bytes = kmem_page_size << order;
        struct page_stat *pstat;
        struct page_stat this = {
                .order = order,
        };

        if (use_pfn)
                page = evsel__intval(evsel, sample, "pfn");
        else
                page = evsel__intval(evsel, sample, "page");

        nr_page_frees++;
        total_page_free_bytes += bytes;

        this.page = page;
        pstat = page_stat__find_page(&this);
        if (pstat == NULL) {
                pr_debug2("free without matching alloc: page %"PRIx64" (order: %d)\n",
                          page, order);

                nr_page_nomatch++;
                total_page_nomatch_bytes += bytes;

                return 0;
        }

        this.gfp_flags = pstat->gfp_flags;
        this.migrate_type = pstat->migrate_type;
        this.callsite = pstat->callsite;

        rb_erase(&pstat->node, &page_live_tree);
        free(pstat);

        if (live_page) {
                order_stats[this.order][this.migrate_type]--;
        } else {
                pstat = page_stat__find_alloc(&this);
                if (pstat == NULL)
                        return -ENOMEM;

                pstat->nr_free++;
                pstat->free_bytes += bytes;
        }

        pstat = page_stat__find_caller(&this);
        if (pstat == NULL)
                return -ENOENT;

        pstat->nr_free++;
        pstat->free_bytes += bytes;

        if (live_page) {
                pstat->nr_alloc--;
                pstat->alloc_bytes -= bytes;

                if (pstat->nr_alloc == 0) {
                        rb_erase(&pstat->node, &page_caller_tree);
                        free(pstat);
                }
        }

        return 0;
}

static bool perf_kmem__skip_sample(struct perf_sample *sample)
{
        /* skip sample based on time? */
        if (perf_time__skip_sample(&ptime, sample->time))
                return true;

        return false;
}

typedef int (*tracepoint_handler)(struct evsel *evsel,
                                  struct perf_sample *sample);

static int process_sample_event(struct perf_tool *tool __maybe_unused,
                                union perf_event *event,
                                struct perf_sample *sample,
                                struct evsel *evsel,
                                struct machine *machine)
{
        int err = 0;
        struct thread *thread = machine__findnew_thread(machine, sample->pid,
                                                        sample->tid);

        if (thread == NULL) {
                pr_debug("problem processing %d event, skipping it.\n",
                         event->header.type);
                return -1;
        }

        /* drop the thread reference on the early return as well */
        if (perf_kmem__skip_sample(sample)) {
                thread__put(thread);
                return 0;
        }

        dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid);

        if (evsel->handler != NULL) {
                tracepoint_handler f = evsel->handler;
                err = f(evsel, sample);
        }

        thread__put(thread);

        return err;
}

static struct perf_tool perf_kmem = {
        .sample          = process_sample_event,
        .comm            = perf_event__process_comm,
        .mmap            = perf_event__process_mmap,
        .mmap2           = perf_event__process_mmap2,
        .namespaces      = perf_event__process_namespaces,
        .ordered_events  = true,
};

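/*
 * Internal fragmentation in percent: the share of allocated bytes that
 * was never requested, e.g. fragmentation(96, 128) == 25.0.
 */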
static double fragmentation(unsigned long n_req, unsigned long n_alloc)
{
        if (n_alloc == 0)
                return 0.0;
        else
                return 100.0 - (100.0 * n_req / n_alloc);
}

static void __print_slab_result(struct rb_root *root,
                                struct perf_session *session,
                                int n_lines, int is_caller)
{
        struct rb_node *next;
        struct machine *machine = &session->machines.host;

        printf("%.105s\n", graph_dotted_line);
        printf(" %-34s |",  is_caller ? "Callsite": "Alloc Ptr");
        printf(" Total_alloc/Per | Total_req/Per   | Hit      | Ping-pong | Frag\n");
        printf("%.105s\n", graph_dotted_line);

        next = rb_first(root);

        while (next && n_lines--) {
                struct alloc_stat *data = rb_entry(next, struct alloc_stat,
                                                   node);
                struct symbol *sym = NULL;
                struct map *map;
                char buf[BUFSIZ];
                u64 addr;

                if (is_caller) {
                        addr = data->call_site;
                        if (!raw_ip)
                                sym = machine__find_kernel_symbol(machine, addr, &map);
                } else
                        addr = data->ptr;

                if (sym != NULL)
                        snprintf(buf, sizeof(buf), "%s+%" PRIx64 "", sym->name,
                                 addr - map->unmap_ip(map, sym->start));
                else
                        snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr);
                printf(" %-34s |", buf);

                printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %9lu | %6.3f%%\n",
                       (unsigned long long)data->bytes_alloc,
                       (unsigned long)data->bytes_alloc / data->hit,
                       (unsigned long long)data->bytes_req,
                       (unsigned long)data->bytes_req / data->hit,
                       (unsigned long)data->hit,
                       (unsigned long)data->pingpong,
                       fragmentation(data->bytes_req, data->bytes_alloc));

                next = rb_next(next);
        }

        if (n_lines == -1)
                printf(" ...                                | ...             | ...             | ...      | ...       | ...   \n");

        printf("%.105s\n", graph_dotted_line);
}

static const char * const migrate_type_str[] = {
        "UNMOVABL",
        "RECLAIM",
        "MOVABLE",
        "RESERVED",
        "CMA/ISLT",
        "UNKNOWN",
};

static void __print_page_alloc_result(struct perf_session *session, int n_lines)
{
        struct rb_node *next = rb_first(&page_alloc_sorted);
        struct machine *machine = &session->machines.host;
        const char *format;
        int gfp_len = max(strlen("GFP flags"), max_gfp_len);

        printf("\n%.105s\n", graph_dotted_line);
        printf(" %-16s | %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
               use_pfn ? "PFN" : "Page", live_page ? "Live" : "Total",
               gfp_len, "GFP flags");
        printf("%.105s\n", graph_dotted_line);

        if (use_pfn)
                format = " %16llu | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";
        else
                format = " %016llx | %'16llu | %'9d | %5d | %8s | %-*s | %s\n";

        while (next && n_lines--) {
                struct page_stat *data;
                struct symbol *sym;
                struct map *map;
                char buf[32];
                char *caller = buf;

                data = rb_entry(next, struct page_stat, node);
                sym = machine__find_kernel_symbol(machine, data->callsite, &map);
                if (sym)
                        caller = sym->name;
                else
                        scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);

                printf(format, (unsigned long long)data->page,
                       (unsigned long long)data->alloc_bytes / 1024,
                       data->nr_alloc, data->order,
                       migrate_type_str[data->migrate_type],
                       gfp_len, compact_gfp_string(data->gfp_flags), caller);

                next = rb_next(next);
        }

        if (n_lines == -1) {
                printf(" ...              | ...              | ...       | ...   | ...      | %-*s | ...\n",
                       gfp_len, "...");
        }

        printf("%.105s\n", graph_dotted_line);
}

static void __print_page_caller_result(struct perf_session *session, int n_lines)
{
        struct rb_node *next = rb_first(&page_caller_sorted);
        struct machine *machine = &session->machines.host;
        int gfp_len = max(strlen("GFP flags"), max_gfp_len);

        printf("\n%.105s\n", graph_dotted_line);
        printf(" %5s alloc (KB) | Hits      | Order | Mig.type | %-*s | Callsite\n",
               live_page ? "Live" : "Total", gfp_len, "GFP flags");
        printf("%.105s\n", graph_dotted_line);

        while (next && n_lines--) {
                struct page_stat *data;
                struct symbol *sym;
                struct map *map;
                char buf[32];
                char *caller = buf;

                data = rb_entry(next, struct page_stat, node);
                sym = machine__find_kernel_symbol(machine, data->callsite, &map);
                if (sym)
                        caller = sym->name;
                else
                        scnprintf(buf, sizeof(buf), "%"PRIx64, data->callsite);

                printf(" %'16llu | %'9d | %5d | %8s | %-*s | %s\n",
                       (unsigned long long)data->alloc_bytes / 1024,
                       data->nr_alloc, data->order,
                       migrate_type_str[data->migrate_type],
                       gfp_len, compact_gfp_string(data->gfp_flags), caller);

                next = rb_next(next);
        }

        if (n_lines == -1) {
                printf(" ...              | ...       | ...   | ...      | %-*s | ...\n",
                       gfp_len, "...");
        }

        printf("%.105s\n", graph_dotted_line);
}

static void print_gfp_flags(void)
{
        int i;

        printf("#\n");
        printf("# GFP flags\n");
        printf("# ---------\n");
        for (i = 0; i < nr_gfps; i++) {
                printf("# %08x: %*s: %s\n", gfps[i].flags,
                       (int) max_gfp_len, gfps[i].compact_str,
                       gfps[i].human_readable);
        }
}

static void print_slab_summary(void)
{
        printf("\nSUMMARY (SLAB allocator)");
        printf("\n========================\n");
        printf("Total bytes requested: %'lu\n", total_requested);
        printf("Total bytes allocated: %'lu\n", total_allocated);
        printf("Total bytes freed:     %'lu\n", total_freed);
        if (total_allocated > total_freed) {
                printf("Net total bytes allocated: %'lu\n",
                       total_allocated - total_freed);
        }
        printf("Total bytes wasted on internal fragmentation: %'lu\n",
               total_allocated - total_requested);
        printf("Internal fragmentation: %f%%\n",
               fragmentation(total_requested, total_allocated));
        printf("Cross CPU allocations: %'lu/%'lu\n", nr_cross_allocs, nr_allocs);
}

static void print_page_summary(void)
{
        int o, m;
        u64 nr_alloc_freed = nr_page_frees - nr_page_nomatch;
        u64 total_alloc_freed_bytes = total_page_free_bytes - total_page_nomatch_bytes;

        printf("\nSUMMARY (page allocator)");
        printf("\n========================\n");
        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation requests",
               nr_page_allocs, total_page_alloc_bytes / 1024);
        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free requests",
               nr_page_frees, total_page_free_bytes / 1024);
        printf("\n");

        printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc+freed requests",
               nr_alloc_freed, (total_alloc_freed_bytes) / 1024);
        printf("%-30s: %'16"PRIu64"   [ %'16"PRIu64" KB ]\n", "Total alloc-only requests",
               nr_page_allocs - nr_alloc_freed,
               (total_page_alloc_bytes - total_alloc_freed_bytes) / 1024);
        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total free-only requests",
               nr_page_nomatch, total_page_nomatch_bytes / 1024);
        printf("\n");

        printf("%-30s: %'16lu   [ %'16"PRIu64" KB ]\n", "Total allocation failures",
               nr_page_fails, total_page_fail_bytes / 1024);
        printf("\n");

        printf("%5s  %12s  %12s  %12s  %12s  %12s\n", "Order",  "Unmovable",
               "Reclaimable", "Movable", "Reserved", "CMA/Isolated");
        printf("%.5s  %.12s  %.12s  %.12s  %.12s  %.12s\n", graph_dotted_line,
               graph_dotted_line, graph_dotted_line, graph_dotted_line,
               graph_dotted_line, graph_dotted_line);

        for (o = 0; o < MAX_PAGE_ORDER; o++) {
                printf("%5d", o);
                for (m = 0; m < MAX_MIGRATE_TYPES - 1; m++) {
                        if (order_stats[o][m])
                                printf("  %'12d", order_stats[o][m]);
                        else
                                printf("  %12c", '.');
                }
                printf("\n");
        }
}

static void print_slab_result(struct perf_session *session)
{
        if (caller_flag)
                __print_slab_result(&root_caller_sorted, session, caller_lines, 1);
        if (alloc_flag)
                __print_slab_result(&root_alloc_sorted, session, alloc_lines, 0);
        print_slab_summary();
}

static void print_page_result(struct perf_session *session)
{
        if (caller_flag || alloc_flag)
                print_gfp_flags();
        if (caller_flag)
                __print_page_caller_result(session, caller_lines);
        if (alloc_flag)
                __print_page_alloc_result(session, alloc_lines);
        print_page_summary();
}

static void print_result(struct perf_session *session)
{
        if (kmem_slab)
                print_slab_result(session);
        if (kmem_page)
                print_page_result(session);
}

static LIST_HEAD(slab_caller_sort);
static LIST_HEAD(slab_alloc_sort);
static LIST_HEAD(page_caller_sort);
static LIST_HEAD(page_alloc_sort);

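/*
 * Re-insert an alloc_stat node into a result tree ordered by the
 * user-selected sort keys: each key in sort_list is tried in turn and
 * the first non-zero comparison decides the position, larger values
 * first.
 */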
static void sort_slab_insert(struct rb_root *root, struct alloc_stat *data,
                             struct list_head *sort_list)
{
        struct rb_node **new = &(root->rb_node);
        struct rb_node *parent = NULL;
        struct sort_dimension *sort;

        while (*new) {
                struct alloc_stat *this;
                int cmp = 0;

                this = rb_entry(*new, struct alloc_stat, node);
                parent = *new;

                list_for_each_entry(sort, sort_list, list) {
                        cmp = sort->cmp(data, this);
                        if (cmp)
                                break;
                }

                if (cmp > 0)
                        new = &((*new)->rb_left);
                else
                        new = &((*new)->rb_right);
        }

        rb_link_node(&data->node, parent, new);
        rb_insert_color(&data->node, root);
}

static void __sort_slab_result(struct rb_root *root, struct rb_root *root_sorted,
                               struct list_head *sort_list)
{
        struct rb_node *node;
        struct alloc_stat *data;

        for (;;) {
                node = rb_first(root);
                if (!node)
                        break;

                rb_erase(node, root);
                data = rb_entry(node, struct alloc_stat, node);
                sort_slab_insert(root_sorted, data, sort_list);
        }
}

static void sort_page_insert(struct rb_root *root, struct page_stat *data,
                             struct list_head *sort_list)
{
        struct rb_node **new = &root->rb_node;
        struct rb_node *parent = NULL;
        struct sort_dimension *sort;

        while (*new) {
                struct page_stat *this;
                int cmp = 0;

                this = rb_entry(*new, struct page_stat, node);
                parent = *new;

                list_for_each_entry(sort, sort_list, list) {
                        cmp = sort->cmp(data, this);
                        if (cmp)
                                break;
                }

                if (cmp > 0)
                        new = &parent->rb_left;
                else
                        new = &parent->rb_right;
        }

        rb_link_node(&data->node, parent, new);
        rb_insert_color(&data->node, root);
}

static void __sort_page_result(struct rb_root *root, struct rb_root *root_sorted,
                               struct list_head *sort_list)
{
        struct rb_node *node;
        struct page_stat *data;

        for (;;) {
                node = rb_first(root);
                if (!node)
                        break;

                rb_erase(node, root);
                data = rb_entry(node, struct page_stat, node);
                sort_page_insert(root_sorted, data, sort_list);
        }
}

static void sort_result(void)
{
        if (kmem_slab) {
                __sort_slab_result(&root_alloc_stat, &root_alloc_sorted,
                                   &slab_alloc_sort);
                __sort_slab_result(&root_caller_stat, &root_caller_sorted,
                                   &slab_caller_sort);
        }
        if (kmem_page) {
                if (live_page)
                        __sort_page_result(&page_live_tree, &page_alloc_sorted,
                                           &page_alloc_sort);
                else
                        __sort_page_result(&page_alloc_tree, &page_alloc_sorted,
                                           &page_alloc_sort);

                __sort_page_result(&page_caller_tree, &page_caller_sorted,
                                   &page_caller_sort);
        }
}

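/*
 * Replay a recorded session: wire the tracepoint handlers to the kmem
 * events found in the file, process all events, then sort and print
 * the result.
 */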
1376 static int __cmd_kmem(struct perf_session *session)
1377 {
1378         int err = -EINVAL;
1379         struct evsel *evsel;
1380         const struct evsel_str_handler kmem_tracepoints[] = {
1381                 /* slab allocator */
1382                 { "kmem:kmalloc",               evsel__process_alloc_event, },
1383                 { "kmem:kmem_cache_alloc",      evsel__process_alloc_event, },
1384                 { "kmem:kmalloc_node",          evsel__process_alloc_event, },
1385                 { "kmem:kmem_cache_alloc_node", evsel__process_alloc_event, },
1386                 { "kmem:kfree",                 evsel__process_free_event, },
1387                 { "kmem:kmem_cache_free",       evsel__process_free_event, },
1388                 /* page allocator */
1389                 { "kmem:mm_page_alloc",         evsel__process_page_alloc_event, },
1390                 { "kmem:mm_page_free",          evsel__process_page_free_event, },
1391         };
1392
1393         if (!perf_session__has_traces(session, "kmem record"))
1394                 goto out;
1395
1396         if (perf_session__set_tracepoints_handlers(session, kmem_tracepoints)) {
1397                 pr_err("Initializing perf session tracepoint handlers failed\n");
1398                 goto out;
1399         }
1400
1401         evlist__for_each_entry(session->evlist, evsel) {
1402                 if (!strcmp(evsel__name(evsel), "kmem:mm_page_alloc") &&
1403                     evsel__field(evsel, "pfn")) {
1404                         use_pfn = true;
1405                         break;
1406                 }
1407         }
1408
1409         setup_pager();
1410         err = perf_session__process_events(session);
1411         if (err != 0) {
1412                 pr_err("error during process events: %d\n", err);
1413                 goto out;
1414         }
1415         sort_result();
1416         print_result(session);
1417 out:
1418         return err;
1419 }
1420
1421 /* slab sort keys */
1422 static int ptr_cmp(void *a, void *b)
1423 {
1424         struct alloc_stat *l = a;
1425         struct alloc_stat *r = b;
1426
1427         if (l->ptr < r->ptr)
1428                 return -1;
1429         else if (l->ptr > r->ptr)
1430                 return 1;
1431         return 0;
1432 }
1433
1434 static struct sort_dimension ptr_sort_dimension = {
1435         .name   = "ptr",
1436         .cmp    = ptr_cmp,
1437 };
1438
1439 static int slab_callsite_cmp(void *a, void *b)
1440 {
1441         struct alloc_stat *l = a;
1442         struct alloc_stat *r = b;
1443
1444         if (l->call_site < r->call_site)
1445                 return -1;
1446         else if (l->call_site > r->call_site)
1447                 return 1;
1448         return 0;
1449 }
1450
1451 static struct sort_dimension callsite_sort_dimension = {
1452         .name   = "callsite",
1453         .cmp    = slab_callsite_cmp,
1454 };
1455
1456 static int hit_cmp(void *a, void *b)
1457 {
1458         struct alloc_stat *l = a;
1459         struct alloc_stat *r = b;
1460
1461         if (l->hit < r->hit)
1462                 return -1;
1463         else if (l->hit > r->hit)
1464                 return 1;
1465         return 0;
1466 }
1467
1468 static struct sort_dimension hit_sort_dimension = {
1469         .name   = "hit",
1470         .cmp    = hit_cmp,
1471 };
1472
1473 static int bytes_cmp(void *a, void *b)
1474 {
1475         struct alloc_stat *l = a;
1476         struct alloc_stat *r = b;
1477
1478         if (l->bytes_alloc < r->bytes_alloc)
1479                 return -1;
1480         else if (l->bytes_alloc > r->bytes_alloc)
1481                 return 1;
1482         return 0;
1483 }
1484
1485 static struct sort_dimension bytes_sort_dimension = {
1486         .name   = "bytes",
1487         .cmp    = bytes_cmp,
1488 };
1489
1490 static int frag_cmp(void *a, void *b)
1491 {
1492         double x, y;
1493         struct alloc_stat *l = a;
1494         struct alloc_stat *r = b;
1495
1496         x = fragmentation(l->bytes_req, l->bytes_alloc);
1497         y = fragmentation(r->bytes_req, r->bytes_alloc);
1498
1499         if (x < y)
1500                 return -1;
1501         else if (x > y)
1502                 return 1;
1503         return 0;
1504 }
1505
1506 static struct sort_dimension frag_sort_dimension = {
1507         .name   = "frag",
1508         .cmp    = frag_cmp,
1509 };
1510
1511 static int pingpong_cmp(void *a, void *b)
1512 {
1513         struct alloc_stat *l = a;
1514         struct alloc_stat *r = b;
1515
1516         if (l->pingpong < r->pingpong)
1517                 return -1;
1518         else if (l->pingpong > r->pingpong)
1519                 return 1;
1520         return 0;
1521 }
1522
1523 static struct sort_dimension pingpong_sort_dimension = {
1524         .name   = "pingpong",
1525         .cmp    = pingpong_cmp,
1526 };
1527
1528 /* page sort keys */
1529 static int page_cmp(void *a, void *b)
1530 {
1531         struct page_stat *l = a;
1532         struct page_stat *r = b;
1533
1534         if (l->page < r->page)
1535                 return -1;
1536         else if (l->page > r->page)
1537                 return 1;
1538         return 0;
1539 }
1540
1541 static struct sort_dimension page_sort_dimension = {
1542         .name   = "page",
1543         .cmp    = page_cmp,
1544 };
1545
1546 static int page_callsite_cmp(void *a, void *b)
1547 {
1548         struct page_stat *l = a;
1549         struct page_stat *r = b;
1550
1551         if (l->callsite < r->callsite)
1552                 return -1;
1553         else if (l->callsite > r->callsite)
1554                 return 1;
1555         return 0;
1556 }
1557
1558 static struct sort_dimension page_callsite_sort_dimension = {
1559         .name   = "callsite",
1560         .cmp    = page_callsite_cmp,
1561 };
1562
1563 static int page_hit_cmp(void *a, void *b)
1564 {
1565         struct page_stat *l = a;
1566         struct page_stat *r = b;
1567
1568         if (l->nr_alloc < r->nr_alloc)
1569                 return -1;
1570         else if (l->nr_alloc > r->nr_alloc)
1571                 return 1;
1572         return 0;
1573 }
1574
1575 static struct sort_dimension page_hit_sort_dimension = {
1576         .name   = "hit",
1577         .cmp    = page_hit_cmp,
1578 };
1579
1580 static int page_bytes_cmp(void *a, void *b)
1581 {
1582         struct page_stat *l = a;
1583         struct page_stat *r = b;
1584
1585         if (l->alloc_bytes < r->alloc_bytes)
1586                 return -1;
1587         else if (l->alloc_bytes > r->alloc_bytes)
1588                 return 1;
1589         return 0;
1590 }
1591
1592 static struct sort_dimension page_bytes_sort_dimension = {
1593         .name   = "bytes",
1594         .cmp    = page_bytes_cmp,
1595 };
1596
1597 static int page_order_cmp(void *a, void *b)
1598 {
1599         struct page_stat *l = a;
1600         struct page_stat *r = b;
1601
1602         if (l->order < r->order)
1603                 return -1;
1604         else if (l->order > r->order)
1605                 return 1;
1606         return 0;
1607 }
1608
1609 static struct sort_dimension page_order_sort_dimension = {
1610         .name   = "order",
1611         .cmp    = page_order_cmp,
1612 };
1613
1614 static int migrate_type_cmp(void *a, void *b)
1615 {
1616         struct page_stat *l = a;
1617         struct page_stat *r = b;
1618
	/* for internal use to find a freed page */
1620         if (l->migrate_type == -1U)
1621                 return 0;
1622
1623         if (l->migrate_type < r->migrate_type)
1624                 return -1;
1625         else if (l->migrate_type > r->migrate_type)
1626                 return 1;
1627         return 0;
1628 }
1629
1630 static struct sort_dimension migrate_type_sort_dimension = {
1631         .name   = "migtype",
1632         .cmp    = migrate_type_cmp,
1633 };
1634
1635 static int gfp_flags_cmp(void *a, void *b)
1636 {
1637         struct page_stat *l = a;
1638         struct page_stat *r = b;
1639
	/* for internal use to find a freed page */
1641         if (l->gfp_flags == -1U)
1642                 return 0;
1643
1644         if (l->gfp_flags < r->gfp_flags)
1645                 return -1;
1646         else if (l->gfp_flags > r->gfp_flags)
1647                 return 1;
1648         return 0;
1649 }
1650
1651 static struct sort_dimension gfp_flags_sort_dimension = {
1652         .name   = "gfp",
1653         .cmp    = gfp_flags_cmp,
1654 };
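
/*
 * Note the -1U sentinel handling in migrate_type_cmp() and
 * gfp_flags_cmp(): a free event does not carry a migrate type or GFP
 * flags, so a lookup key using -1U compares equal to the allocation
 * entry regardless of what those fields hold.
 */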
1655
1656 static struct sort_dimension *slab_sorts[] = {
1657         &ptr_sort_dimension,
1658         &callsite_sort_dimension,
1659         &hit_sort_dimension,
1660         &bytes_sort_dimension,
1661         &frag_sort_dimension,
1662         &pingpong_sort_dimension,
1663 };
1664
1665 static struct sort_dimension *page_sorts[] = {
1666         &page_sort_dimension,
1667         &page_callsite_sort_dimension,
1668         &page_hit_sort_dimension,
1669         &page_bytes_sort_dimension,
1670         &page_order_sort_dimension,
1671         &migrate_type_sort_dimension,
1672         &gfp_flags_sort_dimension,
1673 };
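
/*
 * The tables above list the keys accepted by --sort in each mode.
 * Illustrative:
 *
 *   perf kmem stat --slab --sort frag,hit,bytes
 *   perf kmem stat --page --sort order,migtype,gfp
 */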
1674
1675 static int slab_sort_dimension__add(const char *tok, struct list_head *list)
1676 {
1677         struct sort_dimension *sort;
1678         int i;
1679
1680         for (i = 0; i < (int)ARRAY_SIZE(slab_sorts); i++) {
1681                 if (!strcmp(slab_sorts[i]->name, tok)) {
1682                         sort = memdup(slab_sorts[i], sizeof(*slab_sorts[i]));
1683                         if (!sort) {
1684                                 pr_err("%s: memdup failed\n", __func__);
1685                                 return -1;
1686                         }
1687                         list_add_tail(&sort->list, list);
1688                         return 0;
1689                 }
1690         }
1691
1692         return -1;
1693 }
1694
1695 static int page_sort_dimension__add(const char *tok, struct list_head *list)
1696 {
1697         struct sort_dimension *sort;
1698         int i;
1699
1700         for (i = 0; i < (int)ARRAY_SIZE(page_sorts); i++) {
1701                 if (!strcmp(page_sorts[i]->name, tok)) {
1702                         sort = memdup(page_sorts[i], sizeof(*page_sorts[i]));
1703                         if (!sort) {
1704                                 pr_err("%s: memdup failed\n", __func__);
1705                                 return -1;
1706                         }
1707                         list_add_tail(&sort->list, list);
1708                         return 0;
1709                 }
1710         }
1711
1712         return -1;
1713 }
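
/*
 * Each selected dimension is memdup()ed rather than linked in directly,
 * so the same key can sit on several sort lists (e.g. both the caller
 * and the alloc list) with independent list linkage.
 */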
1714
1715 static int setup_slab_sorting(struct list_head *sort_list, const char *arg)
1716 {
1717         char *tok;
1718         char *str = strdup(arg);
1719         char *pos = str;
1720
1721         if (!str) {
1722                 pr_err("%s: strdup failed\n", __func__);
1723                 return -1;
1724         }
1725
1726         while (true) {
1727                 tok = strsep(&pos, ",");
1728                 if (!tok)
1729                         break;
1730                 if (slab_sort_dimension__add(tok, sort_list) < 0) {
			pr_err("Unknown slab --sort key: '%s'\n", tok);
1732                         free(str);
1733                         return -1;
1734                 }
1735         }
1736
1737         free(str);
1738         return 0;
1739 }
1740
1741 static int setup_page_sorting(struct list_head *sort_list, const char *arg)
1742 {
1743         char *tok;
1744         char *str = strdup(arg);
1745         char *pos = str;
1746
1747         if (!str) {
1748                 pr_err("%s: strdup failed\n", __func__);
1749                 return -1;
1750         }
1751
1752         while (true) {
1753                 tok = strsep(&pos, ",");
1754                 if (!tok)
1755                         break;
1756                 if (page_sort_dimension__add(tok, sort_list) < 0) {
			pr_err("Unknown page --sort key: '%s'\n", tok);
1758                         free(str);
1759                         return -1;
1760                 }
1761         }
1762
1763         free(str);
1764         return 0;
1765 }
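
/*
 * Both setup helpers split a comma-separated key string such as
 * "bytes,hit" with strsep() and append one dimension per token; any
 * unknown token fails the whole option.
 */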
1766
1767 static int parse_sort_opt(const struct option *opt __maybe_unused,
1768                           const char *arg, int unset __maybe_unused)
1769 {
1770         if (!arg)
1771                 return -1;
1772
1773         if (kmem_page > kmem_slab ||
1774             (kmem_page == 0 && kmem_slab == 0 && kmem_default == KMEM_PAGE)) {
1775                 if (caller_flag > alloc_flag)
1776                         return setup_page_sorting(&page_caller_sort, arg);
1777                 else
1778                         return setup_page_sorting(&page_alloc_sort, arg);
1779         } else {
1780                 if (caller_flag > alloc_flag)
1781                         return setup_slab_sorting(&slab_caller_sort, arg);
1782                 else
1783                         return setup_slab_sorting(&slab_alloc_sort, arg);
1784         }
1787 }
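
/*
 * The callbacks below implement "last flag wins": each flag sets its
 * own counter to one more than its rival's, so a later --page outbids
 * an earlier --slab (kmem_page > kmem_slab) and vice versa.  The
 * --caller/--alloc pair and the list choice in parse_sort_opt() rely on
 * the same trick.
 */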
1788
1789 static int parse_caller_opt(const struct option *opt __maybe_unused,
1790                             const char *arg __maybe_unused,
1791                             int unset __maybe_unused)
1792 {
1793         caller_flag = (alloc_flag + 1);
1794         return 0;
1795 }
1796
1797 static int parse_alloc_opt(const struct option *opt __maybe_unused,
1798                            const char *arg __maybe_unused,
1799                            int unset __maybe_unused)
1800 {
1801         alloc_flag = (caller_flag + 1);
1802         return 0;
1803 }
1804
1805 static int parse_slab_opt(const struct option *opt __maybe_unused,
1806                           const char *arg __maybe_unused,
1807                           int unset __maybe_unused)
1808 {
1809         kmem_slab = (kmem_page + 1);
1810         return 0;
1811 }
1812
1813 static int parse_page_opt(const struct option *opt __maybe_unused,
1814                           const char *arg __maybe_unused,
1815                           int unset __maybe_unused)
1816 {
1817         kmem_page = (kmem_slab + 1);
1818         return 0;
1819 }
1820
1821 static int parse_line_opt(const struct option *opt __maybe_unused,
1822                           const char *arg, int unset __maybe_unused)
1823 {
1824         int lines;
1825
1826         if (!arg)
1827                 return -1;
1828
1829         lines = strtoul(arg, NULL, 10);
1830
1831         if (caller_flag > alloc_flag)
1832                 caller_lines = lines;
1833         else
1834                 alloc_lines = lines;
1835
1836         return 0;
1837 }
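
/*
 * --line caps how many rows the result tables print and, like --sort,
 * applies to whichever of --caller/--alloc was given last, e.g.:
 *
 *   perf kmem stat --caller --line 20
 */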
1838
static bool slab_legacy_tp_is_exposed(void)
{
	/*
	 * The tracepoints "kmem:kmalloc_node" and
	 * "kmem:kmem_cache_alloc_node" were removed in recent kernels.
	 * If "kmem:kmalloc_node" still exists, the tool is running on an
	 * older kernel and must fall back to these legacy tracepoints.
	 */
	return !IS_ERR(trace_event__tp_format("kmem", "kmalloc_node"));
}
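
/*
 * __cmd_record() synthesizes a 'perf record' command line from the
 * tables below.  With --slab on a kernel that still exposes the legacy
 * tracepoints, the effective command is roughly:
 *
 *   perf record -a -R -c 1 -e kmem:kmalloc -e kmem:kfree \
 *	-e kmem:kmem_cache_alloc -e kmem:kmem_cache_free \
 *	-e kmem:kmalloc_node -e kmem:kmem_cache_alloc_node <command>
 */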
1851
1852 static int __cmd_record(int argc, const char **argv)
1853 {
1854         const char * const record_args[] = {
1855         "record", "-a", "-R", "-c", "1",
1856         };
1857         const char * const slab_events[] = {
1858         "-e", "kmem:kmalloc",
1859         "-e", "kmem:kfree",
1860         "-e", "kmem:kmem_cache_alloc",
1861         "-e", "kmem:kmem_cache_free",
1862         };
1863         const char * const slab_legacy_events[] = {
1864         "-e", "kmem:kmalloc_node",
1865         "-e", "kmem:kmem_cache_alloc_node",
1866         };
1867         const char * const page_events[] = {
1868         "-e", "kmem:mm_page_alloc",
1869         "-e", "kmem:mm_page_free",
1870         };
1871         unsigned int rec_argc, i, j;
1872         const char **rec_argv;
1873         unsigned int slab_legacy_tp_exposed = slab_legacy_tp_is_exposed();
1874
1875         rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1876         if (kmem_slab) {
1877                 rec_argc += ARRAY_SIZE(slab_events);
1878                 if (slab_legacy_tp_exposed)
1879                         rec_argc += ARRAY_SIZE(slab_legacy_events);
1880         }
1881         if (kmem_page)
1882                 rec_argc += ARRAY_SIZE(page_events) + 1; /* for -g */
1883
1884         rec_argv = calloc(rec_argc + 1, sizeof(char *));
1885
1886         if (rec_argv == NULL)
1887                 return -ENOMEM;
1888
1889         for (i = 0; i < ARRAY_SIZE(record_args); i++)
1890                 rec_argv[i] = strdup(record_args[i]);
1891
1892         if (kmem_slab) {
1893                 for (j = 0; j < ARRAY_SIZE(slab_events); j++, i++)
1894                         rec_argv[i] = strdup(slab_events[j]);
1895                 if (slab_legacy_tp_exposed) {
1896                         for (j = 0; j < ARRAY_SIZE(slab_legacy_events); j++, i++)
1897                                 rec_argv[i] = strdup(slab_legacy_events[j]);
1898                 }
1899         }
1900         if (kmem_page) {
1901                 rec_argv[i++] = strdup("-g");
1902
1903                 for (j = 0; j < ARRAY_SIZE(page_events); j++, i++)
1904                         rec_argv[i] = strdup(page_events[j]);
1905         }
1906
1907         for (j = 1; j < (unsigned int)argc; j++, i++)
1908                 rec_argv[i] = argv[j];
1909
1910         return cmd_record(i, rec_argv);
1911 }
1912
1913 static int kmem_config(const char *var, const char *value, void *cb __maybe_unused)
1914 {
1915         if (!strcmp(var, "kmem.default")) {
1916                 if (!strcmp(value, "slab"))
1917                         kmem_default = KMEM_SLAB;
1918                 else if (!strcmp(value, "page"))
1919                         kmem_default = KMEM_PAGE;
1920                 else
1921                         pr_err("invalid default value ('slab' or 'page' required): %s\n",
1922                                value);
1923                 return 0;
1924         }
1925
1926         return 0;
1927 }
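
/*
 * kmem.default picks the analysis mode when neither --slab nor --page
 * is given on the command line.  Illustrative ~/.perfconfig snippet:
 *
 *   [kmem]
 *	default = page
 */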
1928
1929 int cmd_kmem(int argc, const char **argv)
1930 {
1931         const char * const default_slab_sort = "frag,hit,bytes";
1932         const char * const default_page_sort = "bytes,hit";
1933         struct perf_data data = {
1934                 .mode = PERF_DATA_MODE_READ,
1935         };
1936         const struct option kmem_options[] = {
1937         OPT_STRING('i', "input", &input_name, "file", "input file name"),
1938         OPT_INCR('v', "verbose", &verbose,
1939                     "be more verbose (show symbol address, etc)"),
1940         OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL,
1941                            "show per-callsite statistics", parse_caller_opt),
1942         OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
1943                            "show per-allocation statistics", parse_alloc_opt),
1944         OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
1945                      "sort by keys: ptr, callsite, bytes, hit, pingpong, frag, "
1946                      "page, order, migtype, gfp", parse_sort_opt),
1947         OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
1948         OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
1949         OPT_BOOLEAN('f', "force", &data.force, "don't complain, do it"),
1950         OPT_CALLBACK_NOOPT(0, "slab", NULL, NULL, "Analyze slab allocator",
1951                            parse_slab_opt),
1952         OPT_CALLBACK_NOOPT(0, "page", NULL, NULL, "Analyze page allocator",
1953                            parse_page_opt),
1954         OPT_BOOLEAN(0, "live", &live_page, "Show live page stat"),
1955         OPT_STRING(0, "time", &time_str, "str",
1956                    "Time span of interest (start,stop)"),
1957         OPT_END()
1958         };
1959         const char *const kmem_subcommands[] = { "record", "stat", NULL };
1960         const char *kmem_usage[] = {
1961                 NULL,
1962                 NULL
1963         };
1964         struct perf_session *session;
1965         static const char errmsg[] = "No %s allocation events found.  Have you run 'perf kmem record --%s'?\n";
1966         int ret = perf_config(kmem_config, NULL);
1967
1968         if (ret)
1969                 return ret;
1970
1971         argc = parse_options_subcommand(argc, argv, kmem_options,
1972                                         kmem_subcommands, kmem_usage,
1973                                         PARSE_OPT_STOP_AT_NON_OPTION);
1974
1975         if (!argc)
1976                 usage_with_options(kmem_usage, kmem_options);
1977
1978         if (kmem_slab == 0 && kmem_page == 0) {
1979                 if (kmem_default == KMEM_SLAB)
1980                         kmem_slab = 1;
1981                 else
1982                         kmem_page = 1;
1983         }
1984
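	/* "record" may be abbreviated to three or more characters, e.g. "perf kmem rec" */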
1985         if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
1986                 symbol__init(NULL);
1987                 return __cmd_record(argc, argv);
1988         }
1989
1990         data.path = input_name;
1991
1992         kmem_session = session = perf_session__new(&data, &perf_kmem);
1993         if (IS_ERR(session))
1994                 return PTR_ERR(session);
1995
1996         ret = -1;
1997
1998         if (kmem_slab) {
1999                 if (!evlist__find_tracepoint_by_name(session->evlist, "kmem:kmalloc")) {
2000                         pr_err(errmsg, "slab", "slab");
2001                         goto out_delete;
2002                 }
2003         }
2004
2005         if (kmem_page) {
2006                 struct evsel *evsel = evlist__find_tracepoint_by_name(session->evlist, "kmem:mm_page_alloc");
2007
2008                 if (evsel == NULL) {
2009                         pr_err(errmsg, "page", "page");
2010                         goto out_delete;
2011                 }
2012
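		/*
		 * The page size is taken from the recorded trace data (tep),
		 * not from the local system, so reports should stay correct
		 * for a perf.data file recorded on another machine.
		 */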
2013                 kmem_page_size = tep_get_page_size(evsel->tp_format->tep);
2014                 symbol_conf.use_callchain = true;
2015         }
2016
2017         symbol__init(&session->header.env);
2018
2019         if (perf_time__parse_str(&ptime, time_str) != 0) {
2020                 pr_err("Invalid time string\n");
2021                 ret = -EINVAL;
2022                 goto out_delete;
2023         }
2024
2025         if (!strcmp(argv[0], "stat")) {
2026                 setlocale(LC_ALL, "");
2027
2028                 if (cpu__setup_cpunode_map())
2029                         goto out_delete;
2030
2031                 if (list_empty(&slab_caller_sort))
2032                         setup_slab_sorting(&slab_caller_sort, default_slab_sort);
2033                 if (list_empty(&slab_alloc_sort))
2034                         setup_slab_sorting(&slab_alloc_sort, default_slab_sort);
2035                 if (list_empty(&page_caller_sort))
2036                         setup_page_sorting(&page_caller_sort, default_page_sort);
2037                 if (list_empty(&page_alloc_sort))
2038                         setup_page_sorting(&page_alloc_sort, default_page_sort);
2039
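		/*
		 * The *_sort_input lists key the internal trees that collapse
		 * raw alloc/free events into page_stat entries (by page
		 * identity or by callsite); the lists set up above control
		 * display order only.
		 */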
2040                 if (kmem_page) {
2041                         setup_page_sorting(&page_alloc_sort_input,
2042                                            "page,order,migtype,gfp");
2043                         setup_page_sorting(&page_caller_sort_input,
2044                                            "callsite,order,migtype,gfp");
2045                 }
2046                 ret = __cmd_kmem(session);
2047         } else
2048                 usage_with_options(kmem_usage, kmem_options);
2049
2050 out_delete:
2051         perf_session__delete(session);
2052
2053         return ret;
2054 }
2055