watchdog: allow nmi watchdog to use raw perf event
authorSong Liu <song@kernel.org>
Tue, 30 Apr 2024 06:02:36 +0000 (23:02 -0700)
committerAndrew Morton <akpm@linux-foundation.org>
Wed, 8 May 2024 15:41:29 +0000 (08:41 -0700)
The NMI watchdog permanently consumes one hardware counter per CPU on the
system.  For systems that use many hardware counters, this causes more
aggressive time multiplexing of perf events.

OTOH, some CPUs (mostly Intel) support the "ref-cycles" event, which is rarely
used.  Add kernel cmdline arg nmi_watchdog=rNNN to configure the watchdog
to use a raw event.  For example, on Intel CPUs, we can use "r300" to
configure the watchdog to use the ref-cycles event.

If the raw event does not work, fall back to using "cycles".

[akpm@linux-foundation.org: fix kerneldoc]
Link: https://lkml.kernel.org/r/20240430060236.1878002-2-song@kernel.org
Signed-off-by: Song Liu <song@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/admin-guide/kernel-parameters.txt
include/linux/nmi.h
kernel/watchdog.c
kernel/watchdog_perf.c

index 902ecd92a29fbe83df18d32d1a8fe652c8277132..1fa79a3d0d1a257f20894a1a09a71585d75f7e7c 100644 (file)
                        Format: [state][,regs][,debounce][,die]
 
        nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
-                       Format: [panic,][nopanic,][num]
+                       Format: [panic,][nopanic,][rNNN,][num]
                        Valid num: 0 or 1
                        0 - turn hardlockup detector in nmi_watchdog off
                        1 - turn hardlockup detector in nmi_watchdog on
+                       rNNN - configure the watchdog with raw perf event 0xNNN
+
                        When panic is specified, panic when an NMI watchdog
                        timeout occurs (or 'nopanic' to not panic on an NMI
                        watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set)
                                memory, and other data can't be written using
                                xmon commands.
                        off     xmon is disabled.
-
index f53438eae815dead4b6c88bfada6eaf0d7d8ec11..a8dfb38c9bb6f1f97f0bcc37915f0c22ecb5ffc9 100644 (file)
@@ -105,10 +105,12 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_cleanup(void);
+extern void hardlockup_config_perf_event(const char *str);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_detector_perf_cleanup(void) { }
+static inline void hardlockup_config_perf_event(const char *str) { }
 #endif
 
 void watchdog_hardlockup_stop(void);
index 7f54484de16f7ad303a446d428bcf0149d156742..ab0129b15f2513f5fd667dffb383beaad4d1104d 100644 (file)
@@ -80,6 +80,8 @@ next:
                watchdog_hardlockup_user_enabled = 0;
        else if (!strncmp(str, "1", 1))
                watchdog_hardlockup_user_enabled = 1;
+       else if (!strncmp(str, "r", 1))
+               hardlockup_config_perf_event(str + 1);
        while (*(str++)) {
                if (*str == ',') {
                        str++;
index 8ea00c4a24b2d91e330875ac95776e9fc9301f3b..5f7d1f0d426866b94949a295512e25803e925378 100644 (file)
@@ -90,6 +90,14 @@ static struct perf_event_attr wd_hw_attr = {
        .disabled       = 1,
 };
 
+static struct perf_event_attr fallback_wd_hw_attr = {  /* second-chance attr, used if the first event creation fails */
+       .type           = PERF_TYPE_HARDWARE,
+       .config         = PERF_COUNT_HW_CPU_CYCLES,     /* generic "cycles" hardware event */
+       .size           = sizeof(struct perf_event_attr),
+       .pinned         = 1,
+       .disabled       = 1,    /* created disabled */
+};
+
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
                                       struct perf_sample_data *data,
@@ -122,6 +130,13 @@ static int hardlockup_detector_event_create(void)
        /* Try to register using hardware perf events */
        evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
                                               watchdog_overflow_callback, NULL);
+       if (IS_ERR(evt)) {
+               wd_attr = &fallback_wd_hw_attr;
+               wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+               evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+                                                      watchdog_overflow_callback, NULL);
+       }
+
        if (IS_ERR(evt)) {
                pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
                         PTR_ERR(evt));
@@ -259,3 +274,34 @@ int __init watchdog_hardlockup_probe(void)
        }
        return ret;
 }
+
+/**
+ * hardlockup_config_perf_event - Overwrite config of wd_hw_attr.
+ *
+ * @str: number which identifies the raw perf event to use
+ *
+ * @str is parsed as a hexadecimal number.  It may be followed by a comma
+ * and further option text; only the part before the first comma is parsed.
+ * On success wd_hw_attr is switched to PERF_TYPE_RAW with the parsed value
+ * as its config.  If the number is malformed, or the part before the comma
+ * does not fit the local buffer, wd_hw_attr is left untouched.
+ */
+void __init hardlockup_config_perf_event(const char *str)
+{
+       u64 config;
+       char buf[24];
+       char *comma = strchr(str, ',');
+
+       if (!comma) {
+               if (kstrtoull(str, 16, &config))
+                       return;
+       } else {
+               unsigned int len = comma - str;
+
+               if (len >= sizeof(buf))
+                       return;
+
+               /*
+                * Copy only the event number itself.  Copying the whole
+                * remaining string with strscpy() would fail with -E2BIG
+                * whenever the text after the comma is long, and a valid
+                * event number would then be silently dropped.
+                */
+               memcpy(buf, str, len);
+               buf[len] = 0;
+               if (kstrtoull(buf, 16, &config))
+                       return;
+       }
+
+       wd_hw_attr.type = PERF_TYPE_RAW;
+       wd_hw_attr.config = config;
+}