Merge branch 'selftests-net-page_poll-allocation-error-injection'
authorJakub Kicinski <kuba@kernel.org>
Tue, 30 Apr 2024 15:15:33 +0000 (08:15 -0700)
committerJakub Kicinski <kuba@kernel.org>
Tue, 30 Apr 2024 15:15:34 +0000 (08:15 -0700)
Jakub Kicinski says:

====================
selftests: net: page_poll allocation error injection

Add a test for exercising driver memory allocation failure paths.
page pool is a bit tricky to inject errors into at the page allocator
level because of the bulk alloc and recycling, so add explicit error
injection support "in front" of the caches.

Add a test to exercise that using only the standard APIs.
This is the first useful test for the new tests with an endpoint.
There's no point testing netdevsim here, so this is also the first
HW-only test in Python.

I'm not super happy with the traffic generation using iperf3,
my initial approach was to use mausezahn. But it turned out to be
5x slower in terms of PPS. Hopefully this is good enough for now.

v1: https://lore.kernel.org/all/20240426232400.624864-1-kuba@kernel.org/
====================

Link: https://lore.kernel.org/r/20240429144426.743476-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net/core/page_pool.c
tools/testing/selftests/Makefile
tools/testing/selftests/drivers/net/hw/Makefile
tools/testing/selftests/drivers/net/hw/lib/py/__init__.py [new file with mode: 0644]
tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py [new file with mode: 0755]
tools/testing/selftests/drivers/net/lib/py/__init__.py
tools/testing/selftests/drivers/net/lib/py/env.py
tools/testing/selftests/drivers/net/lib/py/load.py [new file with mode: 0644]
tools/testing/selftests/net/lib/py/ksft.py
tools/testing/selftests/net/lib/py/utils.py

index 273c24429bced732cc2c4dac2c6ec655541643b2..8bcc7014a61a006d1cdaffed352c38cca8689334 100644 (file)
@@ -5,6 +5,7 @@
  *     Copyright (C) 2016 Red Hat, Inc.
  */
 
+#include <linux/error-injection.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -550,6 +551,7 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
        return page;
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
+ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
 
 /* Calculate distance between two u32 values, valid if distance is below 2^(31)
  *  https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
index 2c940e9c4cedfe218bd616102d452cf50b5282a8..9039f3709affb91e91af16741c309e8c5c5de5ef 100644 (file)
@@ -119,7 +119,7 @@ TARGETS_HOTPLUG = cpu-hotplug
 TARGETS_HOTPLUG += memory-hotplug
 
 # Networking tests want the net/lib target, include it automatically
-ifneq ($(filter net drivers/net,$(TARGETS)),)
+ifneq ($(filter net drivers/net drivers/net/hw,$(TARGETS)),)
 ifeq ($(filter net/lib,$(TARGETS)),)
        INSTALL_DEP_TARGETS := net/lib
 endif
index 2259a39a70edf5de4dec81f94856650178bb7d3b..1dd732855d76035652fef154ee067ec90e36af9f 100644 (file)
@@ -9,6 +9,7 @@ TEST_PROGS = \
        hw_stats_l3.sh \
        hw_stats_l3_gre.sh \
        loopback.sh \
+       pp_alloc_fail.py \
        #
 
 TEST_FILES := \
@@ -16,6 +17,7 @@ TEST_FILES := \
        #
 
 TEST_INCLUDES := \
+       $(wildcard lib/py/*.py ../lib/py/*.py) \
        ../../../net/lib.sh \
        ../../../net/forwarding/lib.sh \
        ../../../net/forwarding/ipip_lib.sh \
diff --git a/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py b/tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
new file mode 100644 (file)
index 0000000..b582885
--- /dev/null
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import sys
+from pathlib import Path
+
+KSFT_DIR = (Path(__file__).parent / "../../../../..").resolve()
+
+try:
+    sys.path.append(KSFT_DIR.as_posix())
+    from net.lib.py import *
+    from drivers.net.lib.py import *
+except ModuleNotFoundError as e:
+    ksft_pr("Failed importing `net` library from kernel sources")
+    ksft_pr(str(e))
+    ktap_result(True, comment="SKIP")
+    sys.exit(4)
diff --git a/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py b/tools/testing/selftests/drivers/net/hw/pp_alloc_fail.py
new file mode 100755 (executable)
index 0000000..026d989
--- /dev/null
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+import os
+from lib.py import ksft_run, ksft_exit, ksft_pr
+from lib.py import KsftSkipEx, KsftFailEx
+from lib.py import NetdevFamily, NlError
+from lib.py import NetDrvEpEnv
+from lib.py import cmd, tool, GenerateTraffic
+
+
+def _write_fail_config(config):
+    for key, value in config.items():
+        with open("/sys/kernel/debug/fail_function/" + key, "w") as fp:
+            fp.write(str(value) + "\n")
+
+
+def _enable_pp_allocation_fail():
+    if not os.path.exists("/sys/kernel/debug/fail_function"):
+        raise KsftSkipEx("Kernel built without function error injection (or DebugFS)")
+
+    if not os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
+        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
+            fp.write("page_pool_alloc_pages\n")
+
+    _write_fail_config({
+        "verbose": 0,
+        "interval": 511,
+        "probability": 100,
+        "times": -1,
+    })
+
+
+def _disable_pp_allocation_fail():
+    if not os.path.exists("/sys/kernel/debug/fail_function"):
+        return
+
+    if os.path.exists("/sys/kernel/debug/fail_function/page_pool_alloc_pages"):
+        with open("/sys/kernel/debug/fail_function/inject", "w") as fp:
+            fp.write("\n")
+
+    _write_fail_config({
+        "probability": 0,
+        "times": 0,
+    })
+
+
+def test_pp_alloc(cfg, netdevnl):
+    def get_stats():
+        return netdevnl.qstats_get({"ifindex": cfg.ifindex}, dump=True)[0]
+
+    def check_traffic_flowing():
+        stat1 = get_stats()
+        time.sleep(1)
+        stat2 = get_stats()
+        if stat2['rx-packets'] - stat1['rx-packets'] < 15000:
+            raise KsftFailEx("Traffic seems low:", stat2['rx-packets'] - stat1['rx-packets'])
+
+
+    try:
+        stats = get_stats()
+    except NlError as e:
+        if e.nl_msg.error == -95:
+            stats = {}
+        else:
+            raise
+    if 'rx-alloc-fail' not in stats:
+        raise KsftSkipEx("Driver does not report 'rx-alloc-fail' via qstats")
+
+    set_g = False
+    traffic = None
+    try:
+        traffic = GenerateTraffic(cfg)
+
+        check_traffic_flowing()
+
+        _enable_pp_allocation_fail()
+
+        s1 = get_stats()
+        time.sleep(3)
+        s2 = get_stats()
+
+        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 1:
+            raise KsftSkipEx("Allocation failures not increasing")
+        if s2['rx-alloc-fail'] - s1['rx-alloc-fail'] < 100:
+            raise KsftSkipEx("Allocation increasing too slowly", s2['rx-alloc-fail'] - s1['rx-alloc-fail'],
+                             "packets:", s2['rx-packets'] - s1['rx-packets'])
+
+        # Basic failures are fine, try to wobble some settings to catch extra failures
+        check_traffic_flowing()
+        g = tool("ethtool", "-g " + cfg.ifname, json=True)[0]
+        if 'rx' in g and g["rx"] * 2 <= g["rx-max"]:
+            new_g = g['rx'] * 2
+        elif 'rx' in g:
+            new_g = g['rx'] // 2
+        else:
+            new_g = None
+
+        if new_g:
+            set_g = cmd(f"ethtool -G {cfg.ifname} rx {new_g}", fail=False).ret == 0
+            if set_g:
+                ksft_pr("ethtool -G change retval: success")
+            else:
+                ksft_pr("ethtool -G change retval: did not succeed", new_g)
+        else:
+                ksft_pr("ethtool -G change retval: did not try")
+
+        time.sleep(0.1)
+        check_traffic_flowing()
+    finally:
+        _disable_pp_allocation_fail()
+        if traffic:
+            traffic.stop()
+        time.sleep(0.1)
+        if set_g:
+            cmd(f"ethtool -G {cfg.ifname} rx {g['rx']}")
+
+
+def main() -> None:
+    netdevnl = NetdevFamily()
+    with NetDrvEpEnv(__file__, nsim_test=False) as cfg:
+
+        ksft_run([test_pp_alloc], args=(cfg, netdevnl, ))
+    ksft_exit()
+
+
+if __name__ == "__main__":
+    main()
index 4789c1a4282d48a4278210ae5641e6405cd02c4e..401e70f7f1362435113bdb0dfe579de2ed2bd82d 100644 (file)
@@ -15,4 +15,5 @@ except ModuleNotFoundError as e:
     sys.exit(4)
 
 from .env import *
+from .load import *
 from .remote import Remote
index e2ab637e56dcdc690781f51a5cec63493bbd7c71..5c8f695b253665447e42fd812bc41845e01015ed 100644 (file)
@@ -2,7 +2,7 @@
 
 import os
 from pathlib import Path
-from lib.py import KsftSkipEx
+from lib.py import KsftSkipEx, KsftXfailEx
 from lib.py import cmd, ip
 from lib.py import NetNS, NetdevSimDev
 from .remote import Remote
@@ -76,7 +76,7 @@ class NetDrvEpEnv:
     nsim_v4_pfx = "192.0.2."
     nsim_v6_pfx = "2001:db8::"
 
-    def __init__(self, src_path):
+    def __init__(self, src_path, nsim_test=None):
 
         self.env = _load_env_file(src_path)
 
@@ -88,7 +88,10 @@ class NetDrvEpEnv:
         self._ns_peer = None
 
         if "NETIF" in self.env:
+            if nsim_test is True:
+                raise KsftXfailEx("Test only works on netdevsim")
             self._check_env()
+
             self.dev = ip("link show dev " + self.env['NETIF'], json=True)[0]
 
             self.v4 = self.env.get("LOCAL_V4")
@@ -98,6 +101,9 @@ class NetDrvEpEnv:
             kind = self.env["REMOTE_TYPE"]
             args = self.env["REMOTE_ARGS"]
         else:
+            if nsim_test is False:
+                raise KsftXfailEx("Test does not work on netdevsim")
+
             self.create_local()
 
             self.dev = self._ns.nsims[0].dev
diff --git a/tools/testing/selftests/drivers/net/lib/py/load.py b/tools/testing/selftests/drivers/net/lib/py/load.py
new file mode 100644 (file)
index 0000000..abdb677
--- /dev/null
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0
+
+import time
+
+from lib.py import ksft_pr, cmd, ip, rand_port, wait_port_listen
+
+class GenerateTraffic:
+    def __init__(self, env):
+        env.require_cmd("iperf3", remote=True)
+
+        self.env = env
+
+        port = rand_port()
+        self._iperf_server = cmd(f"iperf3 -s -p {port}", background=True)
+        wait_port_listen(port)
+        time.sleep(0.1)
+        self._iperf_client = cmd(f"iperf3 -c {env.addr} -P 16 -p {port} -t 86400",
+                                 background=True, host=env.remote)
+
+        # Wait for traffic to ramp up
+        pkt = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"]
+        for _ in range(50):
+            time.sleep(0.1)
+            now = ip("-s link show dev " + env.ifname, json=True)[0]["stats64"]["rx"]["packets"]
+            if now - pkt > 1000:
+                return
+            pkt = now
+        self.stop(verbose=True)
+        raise Exception("iperf3 traffic did not ramp up")
+
+    def stop(self, verbose=None):
+        self._iperf_client.process(terminate=True)
+        if verbose:
+            ksft_pr(">> Client:")
+            ksft_pr(self._iperf_client.stdout)
+            ksft_pr(self._iperf_client.stderr)
+        self._iperf_server.process(terminate=True)
+        if verbose:
+            ksft_pr(">> Server:")
+            ksft_pr(self._iperf_server.stdout)
+            ksft_pr(self._iperf_server.stderr)
index f84e9fdd0032597417027485e19becaf795cd05f..4769b4eb1ea192d9e2f2e9e0b656f0f4b72efdfd 100644 (file)
@@ -11,6 +11,10 @@ KSFT_RESULT = None
 KSFT_RESULT_ALL = True
 
 
+class KsftFailEx(Exception):
+    pass
+
+
 class KsftSkipEx(Exception):
     pass
 
index d3715e6c21f2e7a0ce7b11dfb29225b5306c6a44..b57d467afd0fb01a7aa8c0a4b087b53e21567b84 100644 (file)
@@ -56,10 +56,10 @@ class bkg(cmd):
         return self.process(terminate=self.terminate)
 
 
-def ip(args, json=None, ns=None, host=None):
-    cmd_str = "ip "
+def tool(name, args, json=None, ns=None, host=None):
+    cmd_str = name + ' '
     if json:
-        cmd_str += '-j '
+        cmd_str += '--json '
     cmd_str += args
     cmd_obj = cmd(cmd_str, ns=ns, host=host)
     if json:
@@ -67,11 +67,17 @@ def ip(args, json=None, ns=None, host=None):
     return cmd_obj
 
 
+def ip(args, json=None, ns=None, host=None):
+    if ns:
+        args = f'-netns {ns} ' + args
+    return tool('ip', args, json=json, host=host)
+
+
 def rand_port():
     """
     Get unprivileged port, for now just random, one day we may decide to check if used.
     """
-    return random.randint(1024, 65535)
+    return random.randint(10000, 65535)
 
 
 def wait_port_listen(port, proto="tcp", ns=None, host=None, sleep=0.005, deadline=5):