uacce: supports device isolation feature
author Kai Ye <yekai13@huawei.com>
Sat, 19 Nov 2022 07:48:15 +0000 (07:48 +0000)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 20 Jan 2023 11:06:26 +0000 (12:06 +0100)
Add a hardware error isolation feature to UACCE. To improve service
reliability, uacce devices that frequently encounter hardware errors
are isolated.

Users can configure the hardware error threshold through the
'isolate_strategy' sysfs node, and user space can read the device's
isolation state from the 'isolate' sysfs node. If the number of device
errors exceeds the configured error threshold, the device is isolated,
which means the uacce device is unavailable.

Signed-off-by: Kai Ye <yekai13@huawei.com>
Link: https://lore.kernel.org/r/20221119074817.12063-2-yekai13@huawei.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
drivers/misc/uacce/uacce.c
include/linux/uacce.h

index 905eff1f840ed296c70bf767de330b0dac349b13..d3a217929a248e1315f8b716bb67c6d51ce253b5 100644 (file)
@@ -363,12 +363,52 @@ static ssize_t region_dus_size_show(struct device *dev,
                       uacce->qf_pg_num[UACCE_QFRT_DUS] << PAGE_SHIFT);
 }
 
+/*
+ * sysfs read handler for the 'isolate' attribute.
+ *
+ * Asks the driver for the device state through the get_isolate_state
+ * callback and writes it to @buf as a decimal integer followed by a
+ * newline. Returns whatever sysfs_emit() returns.
+ */
+static ssize_t isolate_show(struct device *dev, struct device_attribute *attr,
+                           char *buf)
+{
+       struct uacce_device *uacce = to_uacce_device(dev);
+       enum uacce_dev_state state;
+
+       state = uacce->ops->get_isolate_state(uacce);
+
+       return sysfs_emit(buf, "%d\n", state);
+}
+
+/*
+ * sysfs read handler for the 'isolate_strategy' attribute.
+ *
+ * Reads the error threshold through the isolate_err_threshold_read
+ * callback and writes it to @buf as an unsigned decimal followed by a
+ * newline.
+ *
+ * Returns the sysfs_emit() result, or -ENOENT when the driver does not
+ * provide a read callback.
+ */
+static ssize_t isolate_strategy_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+       struct uacce_device *uacce = to_uacce_device(dev);
+       u32 val;
+
+       /*
+        * The attribute is only hidden when both threshold callbacks are
+        * missing, so a driver may provide write without read. Do not call
+        * through a NULL pointer in that case.
+        */
+       if (!uacce->ops->isolate_err_threshold_read)
+               return -ENOENT;
+
+       val = uacce->ops->isolate_err_threshold_read(uacce);
+
+       return sysfs_emit(buf, "%u\n", val);
+}
+
+/*
+ * sysfs write handler for the 'isolate_strategy' attribute.
+ *
+ * Parses @buf as an unsigned integer (base auto-detected by kstrtoul),
+ * rejects values above UACCE_MAX_ERR_THRESHOLD and hands the value to the
+ * isolate_err_threshold_write callback.
+ *
+ * Returns @count on success, -ENOENT when the driver does not provide a
+ * write callback, -EINVAL for unparsable or out-of-range input, or the
+ * non-zero value returned by the callback.
+ */
+static ssize_t isolate_strategy_store(struct device *dev, struct device_attribute *attr,
+                                  const char *buf, size_t count)
+{
+       struct uacce_device *uacce = to_uacce_device(dev);
+       unsigned long val;
+       int ret;
+
+       /*
+        * The attribute is only hidden when both threshold callbacks are
+        * missing, so a driver may provide read without write. Do not call
+        * through a NULL pointer in that case.
+        */
+       if (!uacce->ops->isolate_err_threshold_write)
+               return -ENOENT;
+
+       if (kstrtoul(buf, 0, &val) < 0)
+               return -EINVAL;
+
+       /* The callback takes a u32; the range check keeps val well inside it. */
+       if (val > UACCE_MAX_ERR_THRESHOLD)
+               return -EINVAL;
+
+       ret = uacce->ops->isolate_err_threshold_write(uacce, val);
+       if (ret)
+               return ret;
+
+       return count;
+}
+
 static DEVICE_ATTR_RO(api);
 static DEVICE_ATTR_RO(flags);
 static DEVICE_ATTR_RO(available_instances);
 static DEVICE_ATTR_RO(algorithms);
 static DEVICE_ATTR_RO(region_mmio_size);
 static DEVICE_ATTR_RO(region_dus_size);
+static DEVICE_ATTR_RO(isolate); /* read-only, served by isolate_show() */
+static DEVICE_ATTR_RW(isolate_strategy); /* read/write, served by isolate_strategy_show()/isolate_strategy_store() */
 
 static struct attribute *uacce_dev_attrs[] = {
        &dev_attr_api.attr,
@@ -377,6 +417,8 @@ static struct attribute *uacce_dev_attrs[] = {
        &dev_attr_algorithms.attr,
        &dev_attr_region_mmio_size.attr,
        &dev_attr_region_dus_size.attr,
+       &dev_attr_isolate.attr,
+       &dev_attr_isolate_strategy.attr,
        NULL,
 };
 
@@ -392,6 +434,14 @@ static umode_t uacce_dev_is_visible(struct kobject *kobj,
            (!uacce->qf_pg_num[UACCE_QFRT_DUS])))
                return 0;
 
+       if (attr == &dev_attr_isolate_strategy.attr &&
+           (!uacce->ops->isolate_err_threshold_read &&
+            !uacce->ops->isolate_err_threshold_write))
+               return 0;
+
+       if (attr == &dev_attr_isolate.attr && !uacce->ops->get_isolate_state)
+               return 0;
+
        return attr->mode;
 }
 
index 9ce88c28b0a8714d5a7d49fc5b1d87bad22cd4ab..0a81c3dfd26c802ee7998520b3757f435766021b 100644 (file)
@@ -8,6 +8,7 @@
 #define UACCE_NAME             "uacce"
 #define UACCE_MAX_REGION       2
 #define UACCE_MAX_NAME_SIZE    64
+#define UACCE_MAX_ERR_THRESHOLD        65535
 
 struct uacce_queue;
 struct uacce_device;
@@ -30,6 +31,9 @@ struct uacce_qfile_region {
  * @is_q_updated: check whether the task is finished
  * @mmap: mmap addresses of queue to user space
  * @ioctl: ioctl for user space users of the queue
+ * @get_isolate_state: get the device state after the isolate strategy has been set
+ * @isolate_err_threshold_write: store the isolate error threshold to the device
+ * @isolate_err_threshold_read: read the isolate error threshold value from the device
  */
 struct uacce_ops {
        int (*get_available_instances)(struct uacce_device *uacce);
@@ -43,6 +47,9 @@ struct uacce_ops {
                    struct uacce_qfile_region *qfr);
        long (*ioctl)(struct uacce_queue *q, unsigned int cmd,
                      unsigned long arg);
+       enum uacce_dev_state (*get_isolate_state)(struct uacce_device *uacce);
+       int (*isolate_err_threshold_write)(struct uacce_device *uacce, u32 num);
+       u32 (*isolate_err_threshold_read)(struct uacce_device *uacce);
 };
 
 /**
@@ -57,6 +64,11 @@ struct uacce_interface {
        const struct uacce_ops *ops;
 };
 
+/**
+ * enum uacce_dev_state - device state reported by the get_isolate_state callback
+ * @UACCE_DEV_NORMAL: the device is not isolated
+ * @UACCE_DEV_ISOLATE: the device is isolated
+ */
+enum uacce_dev_state {
+       UACCE_DEV_NORMAL = 0,
+       UACCE_DEV_ISOLATE = 1,
+};
+
 enum uacce_q_state {
        UACCE_Q_ZOMBIE = 0,
        UACCE_Q_INIT,