EDAC: Add support for EDAC device features control
author Shiju Jose <shiju.jose@huawei.com>
Wed, 12 Feb 2025 14:36:39 +0000 (14:36 +0000)
committer Borislav Petkov (AMD) <bp@alien8.de>
Tue, 25 Feb 2025 14:33:27 +0000 (15:33 +0100)
Add generic EDAC device feature controls supporting the registration of RAS
features available in the system. The driver exposes control attributes for
these features to userspace in

  /sys/bus/edac/devices/<dev-name>/<ras-feature>

  [ bp: Touch-up documentation, simplify, make edac_dev_type static,
    fixup edac_dev_register() retvals. ]

Co-developed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Fan Ni <fan.ni@samsung.com>
Tested-by: Daniel Ferguson <danielf@os.amperecomputing.com>
Tested-by: Fan Ni <fan.ni@samsung.com>
Link: https://lore.kernel.org/r/20250212143654.1893-2-shiju.jose@huawei.com
Documentation/edac/features.rst [new file with mode: 0644]
Documentation/edac/index.rst [new file with mode: 0644]
drivers/edac/edac_device.c
include/linux/edac.h

diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst
new file mode 100644 (file)
index 0000000..3c279d0
--- /dev/null
@@ -0,0 +1,93 @@
+.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
+
+=================
+EDAC/RAS features
+=================
+
+Copyright (c) 2024-2025 HiSilicon Limited.
+
+:Author:   Shiju Jose <shiju.jose@huawei.com>
+:License:  The GNU Free Documentation License, Version 1.2 without
+           Invariant Sections, Front-Cover Texts nor Back-Cover Texts.
+           (dual licensed under the GPL v2)
+
+- Written for: 6.15
+
+Introduction
+------------
+
+EDAC/RAS components plugging and high-level design:
+
+1. Scrub control
+
+2. Error Check Scrub (ECS) control
+
+3. ACPI RAS2 features
+
+4. Post Package Repair (PPR) control
+
+5. Memory Sparing Repair control
+
+High level design is illustrated in the following diagram::
+
+        +-----------------------------------------------+
+        |   Userspace - Rasdaemon                       |
+        | +-------------+                               |
+        | | RAS CXL mem |     +---------------+         |
+        | |error handler|---->|               |         |
+        | +-------------+     | RAS dynamic   |         |
+        | +-------------+     | scrub, memory |         |
+        | | RAS memory  |---->| repair control|         |
+        | |error handler|     +----|----------+         |
+        | +-------------+          |                    |
+        +--------------------------|--------------------+
+                                   |
+                                   |
+   +-------------------------------|------------------------------+
+   |     Kernel EDAC extension for | controlling RAS Features     |
+   |+------------------------------|----------------------------+ |
+   || EDAC Core          Sysfs EDAC| Bus                        | |
+   ||   +--------------------------|---------------------------+| |
+   ||   |/sys/bus/edac/devices/<dev>/scrubX/ |   | EDAC device || |
+   ||   |/sys/bus/edac/devices/<dev>/ecsX/   |<->| EDAC MC     || |
+   ||   |/sys/bus/edac/devices/<dev>/repairX |   | EDAC sysfs  || |
+   ||   +---------------------------|--------------------------+| |
+   ||                           EDAC|Bus                        | |
+   ||                               |                           | |
+   ||   +----------+ Get feature    |      Get feature          | |
+   ||   |          | desc +---------|------+ desc +----------+  | |
+   ||   |EDAC scrub|<-----| EDAC device    |      |          |  | |
+   ||   +----------+      | driver- RAS    |----->| EDAC mem |  | |
+   ||   +----------+      | feature control|      | repair   |  | |
+   ||   |          |<-----|                |      +----------+  | |
+   ||   |EDAC ECS  |      +---------|------+                    | |
+   ||   +----------+    Register RAS|features                   | |
+   ||         ______________________|_____________              | |
+   |+---------|---------------|------------------|--------------+ |
+   |  +-------|----+  +-------|-------+     +----|----------+     |
+   |  |            |  | CXL mem driver|     | Client driver |     |
+   |  | ACPI RAS2  |  | scrub, ECS,   |     | memory repair |     |
+   |  | driver     |  | sparing, PPR  |     | features      |     |
+   |  +-----|------+  +-------|-------+     +------|--------+     |
+   |        |                 |                    |              |
+   +--------|-----------------|--------------------|--------------+
+            |                 |                    |
+   +--------|-----------------|--------------------|--------------+
+   |    +---|-----------------|--------------------|-------+      |
+   |    |                                                  |      |
+   |    |            Platform HW and Firmware              |      |
+   |    +--------------------------------------------------+      |
+   +--------------------------------------------------------------+
+
+
+1. EDAC Features components - Create feature-specific descriptors. For
+   example: scrub, ECS, memory repair in the above diagram.
+
+2. EDAC device driver for controlling RAS Features - Gets each feature's
+   attribute descriptors from the EDAC RAS feature components, registers the
+   device's RAS features with the EDAC bus and exposes the feature control
+   attributes via sysfs. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/
+
+3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for
+   dynamic scrub/repair control, which issue scrubbing/repair when an excessive
+   number of corrected memory errors is reported in a short span of time.
diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst
new file mode 100644 (file)
index 0000000..de4a3aa
--- /dev/null
@@ -0,0 +1,10 @@
+.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later
+
+==============
+EDAC Subsystem
+==============
+
+.. toctree::
+   :maxdepth: 1
+
+   features
index 621dc2a5d03474dada099b809b48561106d3c8b8..6af0893cadc94282652bf55822a74ebf26c66880 100644 (file)
@@ -570,3 +570,104 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev,
                      block ? block->name : "N/A", count, msg);
 }
 EXPORT_SYMBOL_GPL(edac_device_handle_ue_count);
+
+/*
+ * Release callback of edac_dev_type: frees the attribute group array
+ * attached to the device and then the enclosing feature context.
+ */
+static void edac_dev_release(struct device *dev)
+{
+       struct edac_dev_feat_ctx *feat_ctx;
+
+       feat_ctx = container_of(dev, struct edac_dev_feat_ctx, dev);
+
+       /* @dev is embedded in the context, so free its groups first. */
+       kfree(dev->groups);
+       kfree(feat_ctx);
+}
+
+/* Device type of EDAC feature devices; ->release frees the feature context. */
+static const struct device_type edac_dev_type = {
+       .release = edac_dev_release,
+       .name = "edac_dev",
+};
+
+/* devm action callback: unregister the device passed in as @data. */
+static void edac_dev_unreg(void *data)
+{
+       struct device *dev = data;
+
+       device_unregister(dev);
+}
+
+/**
+ * edac_dev_register - register device for RAS features with EDAC
+ * @parent: parent device.
+ * @name: name for the folder in the /sys/bus/edac/devices/,
+ *       which is derived from the parent device.
+ *       For e.g. /sys/bus/edac/devices/cxl_mem0/
+ *       The string is used verbatim, it is never interpreted as a format.
+ * @private: parent driver's data to store in the context if any.
+ * @num_features: number of RAS features to register, must be positive.
+ * @ras_features: list of RAS features to register.
+ *
+ * On success, a devm action is attached to @parent which unregisters the
+ * device again.
+ *
+ * Return:
+ *  * %0       - Success.
+ *  * %-EINVAL - Invalid parameters passed.
+ *  * %-ENOMEM - Dynamic memory allocation failed.
+ *  * Other negative error codes returned by dev_set_name(),
+ *    device_register() or devm_add_action_or_reset().
+ *
+ */
+int edac_dev_register(struct device *parent, char *name,
+                     void *private, int num_features,
+                     const struct edac_dev_feature *ras_features)
+{
+       const struct attribute_group **ras_attr_groups;
+       struct edac_dev_feat_ctx *ctx;
+       int attr_gcnt = 0;
+       int ret = -ENOMEM;
+       int feat;
+
+       /*
+        * A negative count would skip both loops below and register a
+        * device without any feature, so reject it along with zero.
+        */
+       if (!parent || !name || num_features <= 0 || !ras_features)
+               return -EINVAL;
+
+       /* Double parse to make space for attributes */
+       for (feat = 0; feat < num_features; feat++) {
+               switch (ras_features[feat].ft_type) {
+               /* Add feature specific code */
+               default:
+                       return -EINVAL;
+               }
+       }
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+
+       /* One extra, zeroed slot terminates the attribute group array. */
+       ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL);
+       if (!ras_attr_groups)
+               goto ctx_free;
+
+       /* Restart the counter for the second pass over the features. */
+       attr_gcnt = 0;
+       for (feat = 0; feat < num_features; feat++, ras_features++) {
+               switch (ras_features->ft_type) {
+               /* Add feature specific code */
+               default:
+                       ret = -EINVAL;
+                       goto groups_free;
+               }
+       }
+
+       ctx->dev.parent = parent;
+       ctx->dev.bus = edac_get_sysfs_subsys();
+       ctx->dev.type = &edac_dev_type;
+       ctx->dev.groups = ras_attr_groups;
+       ctx->private = private;
+       dev_set_drvdata(&ctx->dev, ctx);
+
+       /* Pass @name as an argument so that any '%' in it is taken literally. */
+       ret = dev_set_name(&ctx->dev, "%s", name);
+       if (ret)
+               goto groups_free;
+
+       ret = device_register(&ctx->dev);
+       if (ret) {
+               /*
+                * Do not kfree() here: dropping the reference invokes
+                * edac_dev_release(), which frees the groups and ctx.
+                */
+               put_device(&ctx->dev);
+               return ret;
+       }
+
+       return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev);
+
+groups_free:
+       kfree(ras_attr_groups);
+ctx_free:
+       kfree(ctx);
+       return ret;
+}
+EXPORT_SYMBOL_GPL(edac_dev_register);
index b4ee8961e6236c3ed02b1c67a817c8cee9882c33..8c4b6ca2a994dca6f942c8b63a7bf1cefb12d323 100644 (file)
@@ -661,4 +661,30 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci,
 
        return mci->dimms[index];
 }
+
+/*
+ * RAS feature type.
+ * No feature types are defined yet; RAS_FEAT_MAX is the only enumerator.
+ */
+enum edac_dev_feat {
+       RAS_FEAT_MAX
+};
+
+/*
+ * EDAC device feature information structure.
+ * NOTE(review): not referenced by the code added in this patch; the field
+ * descriptions below are assumptions to confirm against its users.
+ */
+struct edac_dev_data {
+       u8 instance;    /* presumably the feature instance number - TODO confirm */
+       void *private;  /* presumably driver private data - TODO confirm */
+};
+
+struct edac_dev_feat_ctx {
+       struct device dev;      /* device registered by edac_dev_register() */
+       void *private;          /* parent driver's data passed to edac_dev_register() */
+};
+
+struct edac_dev_feature {
+       enum edac_dev_feat ft_type;     /* feature type edac_dev_register() switches on */
+       u8 instance;    /* presumably the feature instance number - TODO confirm */
+       void *ctx;      /* presumably feature-specific context - TODO confirm */
+};
+
+/*
+ * Register @parent's RAS features with EDAC. Parameter names match the
+ * definition and its kernel-doc in drivers/edac/edac_device.c.
+ */
+int edac_dev_register(struct device *parent, char *name,
+                     void *private, int num_features,
+                     const struct edac_dev_feature *ras_features);
 #endif /* _LINUX_EDAC_H_ */