// SPDX-License-Identifier: GPL-2.0-only
/*
 * Detect Hung Task
 *
 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
 *
 */

#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/panic_notifier.h>
#include <linux/sysctl.h>
#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/sysctl.h>
#include <linux/hung_task.h>

#include <trace/events/sched.h>

/*
 * Maximum number of tasks checked per scan:
 */
static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;

/*
 * Total number of tasks detected as hung since boot:
 */
static unsigned long __read_mostly sysctl_hung_task_detect_count;

/*
 * Limit the number of tasks checked in a batch.
 *
 * This value controls the preemptibility of khungtaskd since preemption
 * is disabled during the critical section. It also controls the size of
 * the RCU grace period, so it needs an upper bound.
 */
#define HUNG_TASK_LOCK_BREAK (HZ / 10)
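
/*
 * Worked example (HZ is configuration-dependent, commonly 100..1000):
 * with HZ == 1000 the batch limit is 100 jiffies, with HZ == 100 it is
 * 10 jiffies; either way khungtaskd breaks out of the RCU read-side
 * critical section roughly every 100ms of wall-clock scanning.
 */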

/*
 * Zero means infinite timeout - no checking done:
 */
unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs);
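
/*
 * Tuning sketch (values illustrative, not a recommendation): the timeout
 * can be changed at runtime, e.g.
 *   echo 300 > /proc/sys/kernel/hung_task_timeout_secs
 * or
 *   sysctl -w kernel.hung_task_timeout_secs=300
 * and writing 0 disables the check entirely, as noted above.
 */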

/*
 * Zero (default value) means use sysctl_hung_task_timeout_secs:
 */
static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;

static int __read_mostly sysctl_hung_task_warnings = 10;

static int __read_mostly did_panic;
static bool hung_task_show_lock;
static bool hung_task_call_panic;
static bool hung_task_show_all_bt;

static struct task_struct *watchdog_task;

#ifdef CONFIG_SMP
/*
 * Should we dump all CPUs' backtraces in a hung task event?
 * Defaults to 0, can be changed via sysctl.
 */
static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
#else
#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */

/*
 * Should we panic (and reboot, if panic_timeout= is set) when a
 * hung task is detected:
 */
static unsigned int __read_mostly sysctl_hung_task_panic =
	IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);

static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
{
	did_panic = 1;

	return NOTIFY_DONE;
}

static struct notifier_block panic_block = {
	.notifier_call = hung_task_panic,
};

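/*
 * Note on the blocker encoding (see <linux/hung_task.h> for the
 * authoritative definitions): task->blocker packs the address of the
 * lock being waited on together with a BLOCKER_TYPE_* tag in its low
 * bits, which is why debug_show_blocker() below splits it back apart
 * with hung_task_get_blocker_type() and hung_task_blocker_to_lock()
 * before casting to the concrete lock type.
 */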
#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
static void debug_show_blocker(struct task_struct *task)
{
	struct task_struct *g, *t;
	unsigned long owner, blocker, blocker_type;

	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "No rcu lock held");

	blocker = READ_ONCE(task->blocker);
	if (!blocker)
		return;

	blocker_type = hung_task_get_blocker_type(blocker);

	switch (blocker_type) {
	case BLOCKER_TYPE_MUTEX:
		owner = mutex_get_owner(
			(struct mutex *)hung_task_blocker_to_lock(blocker));
		break;
	case BLOCKER_TYPE_SEM:
		owner = sem_last_holder(
			(struct semaphore *)hung_task_blocker_to_lock(blocker));
		break;
	default:
		WARN_ON_ONCE(1);
		return;
	}

	if (unlikely(!owner)) {
		switch (blocker_type) {
		case BLOCKER_TYPE_MUTEX:
			pr_err("INFO: task %s:%d is blocked on a mutex, but the owner is not found.\n",
			       task->comm, task->pid);
			break;
		case BLOCKER_TYPE_SEM:
			pr_err("INFO: task %s:%d is blocked on a semaphore, but the last holder is not found.\n",
			       task->comm, task->pid);
			break;
		}
		return;
	}

	/* Verify that the owner task still exists before reporting it. */
	for_each_process_thread(g, t) {
		if ((unsigned long)t != owner)
			continue;

		switch (blocker_type) {
		case BLOCKER_TYPE_MUTEX:
			pr_err("INFO: task %s:%d is blocked on a mutex likely owned by task %s:%d.\n",
			       task->comm, task->pid, t->comm, t->pid);
			break;
		case BLOCKER_TYPE_SEM:
			pr_err("INFO: task %s:%d is blocked on a semaphore likely last held by task %s:%d.\n",
			       task->comm, task->pid, t->comm, t->pid);
			break;
		}
		sched_show_task(t);
		return;
	}
}
#else
static inline void debug_show_blocker(struct task_struct *task)
{
}
#endif

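/*
 * Detection sketch (illustrative summary of the logic below): nvcsw +
 * nivcsw is the task's lifetime count of voluntary plus involuntary
 * context switches. If it moved since the last scan, the task ran at
 * some point and only its timestamp is refreshed; a task is reported
 * only when the count is unchanged AND last_switch_time is more than
 * "timeout" seconds in the past.
 */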
static void check_hung_task(struct task_struct *t, unsigned long timeout)
{
	unsigned long switch_count = t->nvcsw + t->nivcsw;

	/*
	 * Ensure the task is not frozen.
	 * Also, skip vfork and any other user process that freezer should skip.
	 */
	if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN))
		return;

	/*
	 * A freshly created task that is scheduled once and sets its state
	 * to TASK_UNINTERRUPTIBLE without ever having been switched out
	 * must not be checked:
	 */
	if (unlikely(!switch_count))
		return;

	if (switch_count != t->last_switch_count) {
		t->last_switch_count = switch_count;
		t->last_switch_time = jiffies;
		return;
	}
	if (time_is_after_jiffies(t->last_switch_time + timeout * HZ))
		return;

	/*
	 * This counter tracks the total number of tasks detected as hung
	 * since boot.
	 */
	sysctl_hung_task_detect_count++;

	trace_sched_process_hang(t);

	if (sysctl_hung_task_panic) {
		console_verbose();
		hung_task_show_lock = true;
		hung_task_call_panic = true;
	}

	/*
	 * Ok, the task did not get scheduled for longer than the timeout
	 * (2 minutes by default), complain:
	 */
	if (sysctl_hung_task_warnings || hung_task_call_panic) {
		if (sysctl_hung_task_warnings > 0)
			sysctl_hung_task_warnings--;
		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
		       t->comm, t->pid, (jiffies - t->last_switch_time) / HZ);
		pr_err("      %s %s %.*s\n",
			print_tainted(), init_utsname()->release,
			(int)strcspn(init_utsname()->version, " "),
			init_utsname()->version);
		if (t->flags & PF_POSTCOREDUMP)
			pr_err("      Blocked by coredump.\n");
		pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
			" disables this message.\n");
		sched_show_task(t);
		debug_show_blocker(t);
		hung_task_show_lock = true;

		if (sysctl_hung_task_all_cpu_backtrace)
			hung_task_show_all_bt = true;
		if (!sysctl_hung_task_warnings)
			pr_info("Future hung task reports are suppressed, see sysctl kernel.hung_task_warnings\n");
	}

	touch_nmi_watchdog();
}

/*
 * To avoid extending the RCU grace period for an unbounded amount of time,
 * periodically exit the critical section and enter a new one.
 *
 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
 * to exit the grace period. For classic RCU, a reschedule is required.
 */
static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
{
	bool can_cont;

	get_task_struct(g);
	get_task_struct(t);
	rcu_read_unlock();
	cond_resched();
	rcu_read_lock();
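	/*
	 * While the RCU read lock was dropped, g or t may have exited and
	 * been reaped; the references taken above keep the task structs
	 * themselves alive, and pid_alive() tells us whether the saved
	 * list position is still valid to resume the scan from.
	 */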
	can_cont = pid_alive(g) && pid_alive(t);
	put_task_struct(t);
	put_task_struct(g);

	return can_cont;
}

/*
 * Check for TASK_UNINTERRUPTIBLE tasks that have not been woken up for
 * a really long time (120 seconds by default). If that happens, print
 * out a warning.
 */
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
	int max_count = sysctl_hung_task_check_count;
	unsigned long last_break = jiffies;
	struct task_struct *g, *t;

	/*
	 * If the system crashed already then all bets are off,
	 * do not report extra hung tasks:
	 */
	if (test_taint(TAINT_DIE) || did_panic)
		return;

	hung_task_show_lock = false;
	rcu_read_lock();
	for_each_process_thread(g, t) {
		unsigned int state;

		if (!max_count--)
			goto unlock;
		if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
			if (!rcu_lock_break(g, t))
				goto unlock;
			last_break = jiffies;
		}
		/*
		 * skip the TASK_KILLABLE tasks -- these can be killed
		 * skip the TASK_IDLE tasks -- those are genuinely idle
		 */
		state = READ_ONCE(t->__state);
		if ((state & TASK_UNINTERRUPTIBLE) &&
		    !(state & TASK_WAKEKILL) &&
		    !(state & TASK_NOLOAD))
			check_hung_task(t, timeout);
	}
 unlock:
	rcu_read_unlock();
	if (hung_task_show_lock)
		debug_show_all_locks();

	if (hung_task_show_all_bt) {
		hung_task_show_all_bt = false;
		trigger_all_cpu_backtrace();
	}

	if (hung_task_call_panic)
		panic("hung_task: blocked tasks");
}

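/*
 * Worked example for the helper below (numbers illustrative): if the
 * last scan ran 30 seconds ago and timeout is 120, the result is
 * -30 * HZ + 120 * HZ = 90 * HZ jiffies of sleep remaining; once the
 * deadline passes the result goes non-positive, which the caller in
 * watchdog() treats as "scan now".
 */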
static long hung_timeout_jiffies(unsigned long last_checked,
				 unsigned long timeout)
{
	/* timeout of 0 will disable the watchdog */
	return timeout ? last_checked - jiffies + timeout * HZ :
		MAX_SCHEDULE_TIMEOUT;
}

#ifdef CONFIG_SYSCTL
/*
 * Process an update of the timeout sysctls and wake the watchdog thread
 * so the new value takes effect without waiting out the old interval.
 */
static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int write,
				  void *buffer,
				  size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

	if (ret || !write)
		goto out;

	wake_up_process(watchdog_task);

 out:
	return ret;
}

/*
 * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
 * and hung_task_check_interval_secs
 */
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
static const struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
	{
		.procname	= "hung_task_all_cpu_backtrace",
		.data		= &sysctl_hung_task_all_cpu_backtrace,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
#endif /* CONFIG_SMP */
	{
		.procname	= "hung_task_panic",
		.data		= &sysctl_hung_task_panic,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{
		.procname	= "hung_task_check_count",
		.data		= &sysctl_hung_task_check_count,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
	},
	{
		.procname	= "hung_task_timeout_secs",
		.data		= &sysctl_hung_task_timeout_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		.procname	= "hung_task_check_interval_secs",
		.data		= &sysctl_hung_task_check_interval_secs,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_dohung_task_timeout_secs,
		.extra2		= (void *)&hung_task_timeout_max,
	},
	{
		.procname	= "hung_task_warnings",
		.data		= &sysctl_hung_task_warnings,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_NEG_ONE,
	},
	{
		.procname	= "hung_task_detect_count",
		.data		= &sysctl_hung_task_detect_count,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0444,
		.proc_handler	= proc_doulongvec_minmax,
	},
};
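
/*
 * Usage example (illustrative): once registered below under the "kernel"
 * directory, the entries above appear as /proc/sys/kernel/hung_task_*, e.g.
 *   sysctl kernel.hung_task_detect_count    # read-only counter (mode 0444)
 *   sysctl -w kernel.hung_task_panic=1      # panic on the next detected hang
 */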

static void __init hung_task_sysctl_init(void)
{
	register_sysctl_init("kernel", hung_task_sysctls);
}
#else
#define hung_task_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

static atomic_t reset_hung_task = ATOMIC_INIT(0);

void reset_hung_task_detector(void)
{
	atomic_set(&reset_hung_task, 1);
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);

static bool hung_detector_suspended;

static int hungtask_pm_notify(struct notifier_block *self,
			      unsigned long action, void *hcpu)
{
	switch (action) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
	case PM_RESTORE_PREPARE:
		hung_detector_suspended = true;
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
	case PM_POST_RESTORE:
		hung_detector_suspended = false;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

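/*
 * Scheduling sketch (illustrative, assuming the common default of
 * timeout == 120 and interval == 0): interval falls back to the timeout,
 * so the loop below sleeps in 120 * HZ chunks. A sysctl write wakes the
 * thread early via wake_up_process(), and the reset_hung_task flag makes
 * it skip exactly one scan rather than disabling the detector.
 */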
/*
 * kthread which checks for tasks stuck in D state
 */
static int watchdog(void *dummy)
{
	unsigned long hung_last_checked = jiffies;

	set_user_nice(current, 0);

	for ( ; ; ) {
		unsigned long timeout = sysctl_hung_task_timeout_secs;
		unsigned long interval = sysctl_hung_task_check_interval_secs;
		long t;

		if (interval == 0)
			interval = timeout;
		interval = min_t(unsigned long, interval, timeout);
		t = hung_timeout_jiffies(hung_last_checked, interval);
		if (t <= 0) {
			if (!atomic_xchg(&reset_hung_task, 0) &&
			    !hung_detector_suspended)
				check_hung_uninterruptible_tasks(timeout);
			hung_last_checked = jiffies;
			continue;
		}
		schedule_timeout_interruptible(t);
	}

	return 0;
}

static int __init hung_task_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);

	/* Disable hung task detector on suspend */
	pm_notifier(hungtask_pm_notify, 0);

	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
	hung_task_sysctl_init();

	return 0;
}
subsys_initcall(hung_task_init);