Commit | Line | Data |
---|---|---|
725e8ec5 TH |
1 | #!/usr/bin/env drgn |
2 | # | |
3 | # Copyright (C) 2023 Tejun Heo <tj@kernel.org> | |
4 | # Copyright (C) 2023 Meta Platforms, Inc. and affiliates. | |
5 | ||
6 | desc = """ | |
7 | This is a drgn script to monitor workqueues. For more info on drgn, visit | |
8 | https://github.com/osandov/drgn. | |
9 | ||
10 | total Total number of work items executed by the workqueue. | |
11 | ||
12 | infl The number of currently in-flight work items. | |
13 | ||
8a1dd1e5 TH |
14 | CPUtime Total CPU time consumed by the workqueue in seconds. This is |
15 | sampled from scheduler ticks and only provides ballpark | |
16 | measurement. "nohz_full=" CPUs are excluded from measurement. | |
17 | ||
616db877 TH |
18 | CPUitsv The number of times a concurrency-managed work item hogged CPU |
19 | longer than the threshold (workqueue.cpu_intensive_thresh_us) | |
20 | and got excluded from concurrency management to avoid stalling | |
21 | other work items. | |
22 | ||
8639eceb TH |
23 | CMW/RPR For per-cpu workqueues, the number of concurrency-management |
24 | wake-ups while executing a work item of the workqueue. For | |
25 | unbound workqueues, the number of times a worker was repatriated | |
26 | to its affinity scope after being migrated to an off-scope CPU by | |
27 | the scheduler. | |
725e8ec5 TH |
28 | |
29 | mayday The number of times the rescuer was requested while waiting for | |
30 | new worker creation. | |
31 | ||
32 | rescued The number of work items executed by the rescuer. | |
33 | """ | |
34 | ||
35 | import sys | |
36 | import signal | |
37 | import os | |
38 | import re | |
39 | import time | |
40 | import json | |
41 | ||
42 | import drgn | |
43 | from drgn.helpers.linux.list import list_for_each_entry,list_empty | |
44 | from drgn.helpers.linux.cpumask import for_each_possible_cpu | |
45 | ||
46 | import argparse | |
# Command-line interface.  RawTextHelpFormatter keeps the line breaks of the
# multi-line description intact when it is shown by --help.
parser = argparse.ArgumentParser(
    description=desc,
    formatter_class=argparse.RawTextHelpFormatter,
)

# Zero or more regex patterns selecting which workqueues to show.
parser.add_argument(
    'workqueue',
    metavar='REGEX',
    nargs='*',
    help='Target workqueue name patterns (all if empty)',
)

# Seconds between passes; 0 means a single pass.
parser.add_argument(
    '-i', '--interval',
    metavar='SECS',
    type=float,
    default=1,
    help='Monitoring interval (0 to print once and exit)',
)

# Switch from the human-readable table to per-workqueue records.
parser.add_argument(
    '-j', '--json',
    action='store_true',
    help='Output in json',
)

args = parser.parse_args()
56 | ||
def err(s):
    """Write ``s`` followed by a newline to stderr, then exit with status 1.

    Never returns: termination happens by raising ``SystemExit(1)``.
    """
    stream = sys.stderr
    stream.write(f'{s}\n')
    # Push the message out before the interpreter starts shutting down.
    stream.flush()
    raise SystemExit(1)
60 | ||
# Kernel objects and constants looked up by name through the drgn program
# object ``prog``, which this script expects to already be in scope.

# Head of the list that main() walks to visit every workqueue.
workqueues = prog['workqueues']

# Workqueue flag bits tested against wq.flags.
WQ_UNBOUND = prog['WQ_UNBOUND']
WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']

# Indices into the per-pool_workqueue stats[] array.
# work items started execution
PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED']
# work items completed execution
PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED']
# total CPU time consumed
PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME']
# wq_cpu_intensive_thresh_us violations
PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE']
# concurrency-management worker wakeups
PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP']
# unbound workers brought back into scope
PWQ_STAT_REPATRIATED = prog['PWQ_STAT_REPATRIATED']
# maydays to rescuer
PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY']
# linked work items executed by rescuer
PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED']
# Number of entries in stats[]; used to size and iterate the counters.
PWQ_NR_STATS = prog['PWQ_NR_STATS']
75 | ||
class WqStats:
    """Statistics of one workqueue, summed over all of its pool_workqueues.

    Attributes:
        name: Workqueue name decoded from the kernel string.
        unbound: True when WQ_UNBOUND is set in the workqueue's flags.
        mem_reclaim: True when WQ_MEM_RECLAIM is set in the workqueue's flags.
        stats: List of PWQ_NR_STATS integer counters, indexed by PWQ_STAT_*.
    """

    def __init__(self, wq):
        """Read name, flags and counters from the drgn object ``wq``.

        main() passes one ``struct workqueue_struct`` entry per call.
        """
        self.name = wq.name.string_().decode()
        # '&' already binds tighter than '!=' in Python; the parentheses
        # only make the flag test obvious to readers used to C precedence.
        self.unbound = (wq.flags & WQ_UNBOUND) != 0
        self.mem_reclaim = (wq.flags & WQ_MEM_RECLAIM) != 0
        self.stats = [0] * PWQ_NR_STATS
        # Accumulate every pool_workqueue's counters into one total per stat.
        for pwq in list_for_each_entry('struct pool_workqueue', wq.pwqs.address_of_(), 'pwqs_node'):
            for i in range(PWQ_NR_STATS):
                self.stats[i] += int(pwq.stats[i])

    def dict(self, now):
        """Return the raw counters as a plain dict stamped with ``now``.

        Values are the unscaled counters; in particular 'cpu_time' is not
        converted to seconds the way the table output does.
        """
        return { 'timestamp'            : now,
                 'name'                 : self.name,
                 'unbound'              : self.unbound,
                 'mem_reclaim'          : self.mem_reclaim,
                 'started'              : self.stats[PWQ_STAT_STARTED],
                 'completed'            : self.stats[PWQ_STAT_COMPLETED],
                 'cpu_time'             : self.stats[PWQ_STAT_CPU_TIME],
                 'cpu_intensive'        : self.stats[PWQ_STAT_CPU_INTENSIVE],
                 'cm_wakeup'            : self.stats[PWQ_STAT_CM_WAKEUP],
                 'repatriated'          : self.stats[PWQ_STAT_REPATRIATED],
                 'mayday'               : self.stats[PWQ_STAT_MAYDAY],
                 'rescued'              : self.stats[PWQ_STAT_RESCUED], }

    @staticmethod
    def table_header_str():
        """Return the column header line matching table_row_str().

        Declared static because it takes no instance state; it can be
        called on the class or on an instance.
        """
        return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\
            f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}'

    def table_row_str(self):
        """Return this workqueue's statistics as one fixed-width table row."""
        # Columns that do not apply to this kind of workqueue show '-'.
        cpu_intensive = '-'
        cmw_rpr = '-'
        mayday = '-'
        rescued = '-'

        # The CMW/RPR column is shared: repatriations for unbound
        # workqueues, concurrency-management wakeups otherwise.  CPUitsv is
        # only filled in for the latter.
        if self.unbound:
            cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED])
        else:
            cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
            cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP])

        # Rescuer columns are only filled in for WQ_MEM_RECLAIM workqueues.
        if self.mem_reclaim:
            mayday = str(self.stats[PWQ_STAT_MAYDAY])
            rescued = str(self.stats[PWQ_STAT_RESCUED])

        # Long names keep their last 24 characters.  In-flight is
        # started - completed, clamped so it never prints as negative.
        # CPU time is divided by 1000000 before display.
        return f'{self.name[-24:]:24} ' \
            f'{self.stats[PWQ_STAT_STARTED]:8} ' \
            f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
            f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \
            f'{cpu_intensive:>7} ' \
            f'{cmw_rpr:>7} ' \
            f'{mayday:>7} ' \
            f'{rescued:>7} '
129 | ||
# Set by sigint_handler(); main() checks it at the top of every pass.
exit_req = False


def sigint_handler(signr, frame):
    """Signal handler that requests loop termination.

    Both arguments are ignored.  The handler only sets the module-level
    ``exit_req`` flag and returns.
    """
    global exit_req
    exit_req = True
135 | ||
def main():
    """Print workqueue statistics once, or periodically until SIGINT.

    Reads the parsed command-line options from the module-level ``args``:
    the optional name patterns, the interval in seconds (0 means a single
    pass) and whether to emit JSON instead of the table.
    """
    # handle args
    table_fmt = not args.json
    interval = args.interval

    # Several patterns are OR'ed into one regex; an empty result means
    # "no filter" and every workqueue is shown.
    re_str = '|'.join(args.workqueue)
    filter_re = re.compile(re_str) if re_str else None

    # monitoring loop
    # The handler just sets exit_req, which is checked at the top of each
    # pass, so a pass that is already running is printed in full.
    signal.signal(signal.SIGINT, sigint_handler)

    while not exit_req:
        now = time.time()

        if table_fmt:
            print()
            print(WqStats.table_header_str())

        for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'):
            stats = WqStats(wq)
            if filter_re and not filter_re.search(stats.name):
                continue
            if table_fmt:
                print(stats.table_row_str())
            else:
                # One JSON object per line.  Printing the dict directly
                # would emit a Python repr (single quotes, True/False)
                # that JSON parsers reject.
                print(json.dumps(stats.dict(now)))

        if interval == 0:
            break
        time.sleep(interval)

if __name__ == "__main__":
    main()