habanalabs: add new opcodes for INFO IOCTL
author farah kassabri <fkassabri@habana.ai>
Sun, 24 Oct 2021 16:02:32 +0000 (19:02 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Sun, 26 Dec 2021 06:59:05 +0000 (08:59 +0200)
Add implementation for new opcodes in the INFO IOCTL:
1. Retrieve the replaced DRAM rows from f/w.
2. Retrieve the pending DRAM rows from f/w.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
drivers/misc/habanalabs/common/firmware_if.c
drivers/misc/habanalabs/common/habanalabs.h
drivers/misc/habanalabs/common/habanalabs_ioctl.c
drivers/misc/habanalabs/include/common/cpucp_if.h
include/uapi/misc/habanalabs.h

index 9addcfba6a8bc7583dd611d307f94b3d7928759d..70e992bdbde7f44a19c1127145e1f01283a9232d 100644 (file)
@@ -972,6 +972,72 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
        return rc;
 }
 
+int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
+                               struct cpucp_hbm_row_info *info)
+{
+       struct cpucp_hbm_row_info *cpucp_repl_rows_info_cpu_addr;
+       dma_addr_t cpucp_repl_rows_info_dma_addr;
+       struct cpucp_packet pkt = {};
+       u64 result;
+       int rc;
+
+       cpucp_repl_rows_info_cpu_addr =
+                       hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
+                                       sizeof(struct cpucp_hbm_row_info),
+                                       &cpucp_repl_rows_info_dma_addr);
+       if (!cpucp_repl_rows_info_cpu_addr) {
+               dev_err(hdev->dev,
+                       "Failed to allocate DMA memory for CPU-CP replaced rows info packet\n");
+               return -ENOMEM;
+       }
+
+       memset(cpucp_repl_rows_info_cpu_addr, 0, sizeof(struct cpucp_hbm_row_info));
+
+       pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET <<
+                                       CPUCP_PKT_CTL_OPCODE_SHIFT);
+       pkt.addr = cpu_to_le64(cpucp_repl_rows_info_dma_addr);
+       pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_hbm_row_info));
+
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+                                       HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+       if (rc) {
+               dev_err(hdev->dev,
+                       "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
+               goto out;
+       }
+
+       memcpy(info, cpucp_repl_rows_info_cpu_addr, sizeof(*info));
+
+out:
+       hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
+                                       sizeof(struct cpucp_hbm_row_info),
+                                       cpucp_repl_rows_info_cpu_addr);
+
+       return rc;
+}
+
+int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num)
+{
+       struct cpucp_packet pkt;
+       u64 result;
+       int rc;
+
+       memset(&pkt, 0, sizeof(pkt));
+
+       pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_PENDING_ROWS_STATUS << CPUCP_PKT_CTL_OPCODE_SHIFT);
+
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
+       if (rc) {
+               dev_err(hdev->dev,
+                               "Failed to handle CPU-CP pending rows info pkt, error %d\n", rc);
+               goto out;
+       }
+
+       *pend_rows_num = (u32) result;
+out:
+       return rc;
+}
+
 void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
 {
        struct static_fw_load_mgr *static_loader =
index fc201537f7a905e54b845b93b1164f231ee19c48..a19563c416ac4a7b900d9c4930630ea0014a9f5d 100644 (file)
@@ -3012,6 +3012,9 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
                                struct fw_load_mgr *fw_loader,
                                enum comms_cmd cmd, unsigned int size,
                                bool wait_ok, u32 timeout);
+int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
+                               struct cpucp_hbm_row_info *info);
+int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num);
 int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
                        bool is_wc[3]);
 int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data);
index 19726c6b642a1ab2f1ee37f04160e3abe2d701f4..68c655acdec8f3d6fa6b7105976bbb876f6fddf9 100644 (file)
@@ -503,6 +503,43 @@ static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
                min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0;
 }
 
+static int dram_pending_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+       struct hl_device *hdev = hpriv->hdev;
+       u32 max_size = args->return_size;
+       u32 pend_rows_num = 0;
+       void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+       int rc;
+
+       if ((!max_size) || (!out))
+               return -EINVAL;
+
+       rc = hl_fw_dram_pending_row_get(hdev, &pend_rows_num);
+       if (rc)
+               return rc;
+
+       return copy_to_user(out, &pend_rows_num,
+                       min_t(size_t, max_size, sizeof(pend_rows_num))) ? -EFAULT : 0;
+}
+
+static int dram_replaced_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+       struct hl_device *hdev = hpriv->hdev;
+       u32 max_size = args->return_size;
+       struct cpucp_hbm_row_info info = {0};
+       void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+       int rc;
+
+       if ((!max_size) || (!out))
+               return -EINVAL;
+
+       rc = hl_fw_dram_replaced_row_get(hdev, &info);
+       if (rc)
+               return rc;
+
+       return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
                                struct device *dev)
 {
@@ -589,6 +626,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
        case HL_INFO_OPEN_STATS:
                return open_stats_info(hpriv, args);
 
+       case HL_INFO_DRAM_REPLACED_ROWS:
+               return dram_replaced_rows_info(hpriv, args);
+
+       case HL_INFO_DRAM_PENDING_ROWS:
+               return dram_pending_rows_info(hpriv, args);
+
        default:
                dev_err(dev, "Invalid request %d\n", args->op);
                rc = -ENOTTY;
index 17927968e19a3aab6114dc87efceed97719e2d50..5e19c763f3f088533fc68f459e1027193db7828b 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2020 HabanaLabs, Ltd.
+ * Copyright 2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -377,6 +377,13 @@ enum pq_init_status {
  *       a different engine or QMAN according to enum cpucp_idle_mask.
  *       The bit will be 1 if the engine is NOT idle.
  *
+ * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET -
+ *       Fetch data for all HBM replaced rows and rows pending replacement.
+ *
+ * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS -
+ *       Fetch status of HBM rows that are pending replacement and need a
+ *       reboot to be replaced.
+ *
  * CPUCP_PACKET_POWER_SET -
  *       Resets power history of device to 0
  */
@@ -424,6 +431,8 @@ enum cpucp_packet_id {
        CPUCP_PACKET_NIC_STAT_REGS_CLR,         /* internal */
        CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,     /* internal */
        CPUCP_PACKET_IS_IDLE_CHECK,             /* internal */
+       CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */
+       CPUCP_PACKET_HBM_PENDING_ROWS_STATUS,   /* internal */
        CPUCP_PACKET_POWER_SET,                 /* internal */
 };
 
@@ -692,6 +701,7 @@ struct eq_generic_event {
 #define CPUCP_MAX_NIC_LANES            (CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC)
 #define CPUCP_NIC_MASK_ARR_LEN         ((CPUCP_MAX_NICS + 63) / 64)
 #define CPUCP_NIC_POLARITY_ARR_LEN     ((CPUCP_MAX_NIC_LANES + 63) / 64)
+#define CPUCP_HBM_ROW_REPLACE_MAX      32
 
 struct cpucp_sensor {
        __le32 type;
@@ -837,4 +847,25 @@ struct cpucp_nic_status {
        __le32 high_ber_cnt;
 };
 
+enum cpucp_hbm_row_replace_cause {
+       REPLACE_CAUSE_DOUBLE_ECC_ERR,
+       REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR,
+};
+
+struct cpucp_hbm_row_info {
+       __u8 hbm_idx;
+       __u8 pc;
+       __u8 sid;
+       __u8 bank_idx;
+       __le16 row_addr;
+       __u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */
+       __u8 pad;
+};
+
+struct cpucp_hbm_row_replaced_rows_info {
+       __le16 num_replaced_rows;
+       __u8 pad[6];
+       struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX];
+};
+
 #endif /* CPUCP_IF_H */
index 257b9630773eae57069f44c66f4e01875902e2ca..9b4d72897061d3814dd14a50bf92131053560bee 100644 (file)
@@ -334,6 +334,8 @@ enum hl_server_type {
  * HL_INFO_TOTAL_ENERGY  - Retrieve total energy consumption
  * HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency
  * HL_INFO_OPEN_STATS    - Retrieve info regarding recent device open calls
+ * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info
+ * HL_INFO_DRAM_PENDING_ROWS - Retrieve the number of DRAM rows pending replacement
  */
 #define HL_INFO_HW_IP_INFO             0
 #define HL_INFO_HW_EVENTS              1
@@ -353,6 +355,8 @@ enum hl_server_type {
 #define HL_INFO_PLL_FREQUENCY          16
 #define HL_INFO_POWER                  17
 #define HL_INFO_OPEN_STATS             18
+#define HL_INFO_DRAM_REPLACED_ROWS     21
+#define HL_INFO_DRAM_PENDING_ROWS      22
 
 #define HL_INFO_VERSION_MAX_LEN        128
 #define HL_INFO_CARD_NAME_MAX_LEN      16