habanalabs: set 4s timeout for message to device CPU
author Oded Gabbay <oded.gabbay@gmail.com>
Tue, 7 Jul 2020 14:30:13 +0000 (17:30 +0300)
committer Oded Gabbay <oded.gabbay@gmail.com>
Fri, 10 Jul 2020 16:53:03 +0000 (19:53 +0300)
We see that sometimes the CPU in GOYA and GAUDI is occupied by the
power/thermal loop and can't answer requests from the driver fast enough.

Therefore, to avoid false timeout notifications, increase the timeout
to 4 seconds for every message sent to the device CPU.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
drivers/misc/habanalabs/debugfs.c
drivers/misc/habanalabs/firmware_if.c
drivers/misc/habanalabs/gaudi/gaudi.c
drivers/misc/habanalabs/goya/goya.c
drivers/misc/habanalabs/habanalabs.h
drivers/misc/habanalabs/hwmon.c
drivers/misc/habanalabs/sysfs.c

index 136b8f6fa0b3fb4f31a621fe6ea51f822084c764..0bc036e01ee8df1bfbfa8fccd8f41ba1890a4b3d 100644 (file)
@@ -36,7 +36,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
        pkt.i2c_reg = i2c_reg;
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       HL_DEVICE_TIMEOUT_USEC, (long *) val);
+                                               0, (long *) val);
 
        if (rc)
                dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
@@ -63,7 +63,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
        pkt.value = cpu_to_le64(val);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       HL_DEVICE_TIMEOUT_USEC, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc);
@@ -87,7 +87,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
        pkt.value = cpu_to_le64(state);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               HL_DEVICE_TIMEOUT_USEC, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc);
index baf790cf4b787fb4f2164fc443096e58c30c763b..d27841cb5bcb3dc2ef56b3a69e687290bc328626 100644 (file)
@@ -61,7 +61,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
        pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
 
        return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
-                               sizeof(pkt), HL_DEVICE_TIMEOUT_USEC, NULL);
+                                               sizeof(pkt), 0, NULL);
 }
 
 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
@@ -144,7 +144,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
        pkt.value = cpu_to_le64(event_type);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                       HL_DEVICE_TIMEOUT_USEC, &result);
+                                               0, &result);
 
        if (rc)
                dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
@@ -183,7 +183,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
                                                ARMCP_PKT_CTL_OPCODE_SHIFT);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
-                       total_pkt_size, HL_DEVICE_TIMEOUT_USEC, &result);
+                                               total_pkt_size, 0, &result);
 
        if (rc)
                dev_err(hdev->dev, "failed to unmask IRQ array\n");
@@ -204,7 +204,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
        test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
-                       sizeof(test_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
+                                               sizeof(test_pkt), 0, &result);
 
        if (!rc) {
                if (result != ARMCP_PACKET_FENCE_VAL)
@@ -248,7 +248,7 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
        hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
-                       sizeof(hb_pkt), HL_DEVICE_TIMEOUT_USEC, &result);
+                                               sizeof(hb_pkt), 0, &result);
 
        if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
                rc = -EIO;
index 9d6aebef885427fdde38d00013f4d958af5ef4e8..637a9d608707f51e30226e121fad16019659a2bb 100644 (file)
@@ -80,6 +80,7 @@
 #define GAUDI_PLDM_QMAN0_TIMEOUT_USEC  (HL_DEVICE_TIMEOUT_USEC * 30)
 #define GAUDI_PLDM_TPC_KERNEL_WAIT_USEC        (HL_DEVICE_TIMEOUT_USEC * 30)
 #define GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC        1000000         /* 1s */
+#define GAUDI_MSG_TO_CPU_TIMEOUT_USEC  4000000         /* 4s */
 
 #define GAUDI_QMAN0_FENCE_VAL          0x72E91AB9
 
@@ -3479,6 +3480,9 @@ static int gaudi_send_cpu_message(struct hl_device *hdev, u32 *msg,
                return 0;
        }
 
+       if (!timeout)
+               timeout = GAUDI_MSG_TO_CPU_TIMEOUT_USEC;
+
        return hl_fw_send_cpu_message(hdev, GAUDI_QUEUE_ID_CPU_PQ, msg, len,
                                                timeout, result);
 }
index 83f0c70f140b151ec84fa87ad08d4dfea5bb8fb5..88460b2138d88bfe80e10e41b349e7c9c4986df3 100644 (file)
@@ -88,6 +88,7 @@
 #define GOYA_PLDM_MMU_TIMEOUT_USEC     (MMU_CONFIG_TIMEOUT_USEC * 100)
 #define GOYA_PLDM_QMAN0_TIMEOUT_USEC   (HL_DEVICE_TIMEOUT_USEC * 30)
 #define GOYA_BOOT_FIT_REQ_TIMEOUT_USEC 1000000         /* 1s */
+#define GOYA_MSG_TO_CPU_TIMEOUT_USEC   4000000         /* 4s */
 
 #define GOYA_QMAN0_FENCE_VAL           0xD169B243
 
@@ -2830,6 +2831,9 @@ int goya_send_cpu_message(struct hl_device *hdev, u32 *msg, u16 len,
                return 0;
        }
 
+       if (!timeout)
+               timeout = GOYA_MSG_TO_CPU_TIMEOUT_USEC;
+
        return hl_fw_send_cpu_message(hdev, GOYA_QUEUE_ID_CPU_PQ, msg, len,
                                        timeout, result);
 }
@@ -4431,8 +4435,8 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
        pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
                                                ARMCP_PKT_CTL_OPCODE_SHIFT);
 
-       rc = goya_send_cpu_message(hdev, (u32 *) pkt, total_pkt_size,
-                       HL_DEVICE_TIMEOUT_USEC, &result);
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
+                                               total_pkt_size, 0, &result);
 
        if (rc)
                dev_err(hdev->dev, "failed to unmask IRQ array\n");
@@ -4464,8 +4468,8 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
                                ARMCP_PKT_CTL_OPCODE_SHIFT);
        pkt.value = cpu_to_le64(event_type);
 
-       rc = goya_send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                       HL_DEVICE_TIMEOUT_USEC, &result);
+       rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+                                               0, &result);
 
        if (rc)
                dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type);
index dee5cc25fe5b66d3e852f368f1c249319d23a7ba..194d8335269642e8edb2e3c25f858d8392335da0 100644 (file)
@@ -588,7 +588,11 @@ enum hl_pll_frequency {
  * @hw_queues_unlock: release H/W queues lock.
  * @get_pci_id: retrieve PCI ID.
  * @get_eeprom_data: retrieve EEPROM data from F/W.
- * @send_cpu_message: send buffer to ArmCP.
+ * @send_cpu_message: send message to F/W. If the message times out, the
+ *                    driver will eventually reset the device. The timeout can
+ *                    be set by the calling function, or it can be 0, in which
+ *                    case the default timeout for the specific ASIC is
+ *                    used.
  * @get_hw_state: retrieve the H/W state
  * @pci_bars_map: Map PCI BARs.
  * @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
index 8c6cd77e6af6bd3b6ada7cd114867d9d528191a7..b997336fa75fc88ad7a565163697b789e977f4ca 100644 (file)
@@ -10,7 +10,6 @@
 #include <linux/pci.h>
 #include <linux/hwmon.h>
 
-#define SENSORS_PKT_TIMEOUT            1000000 /* 1s */
 #define HWMON_NR_SENSOR_TYPES          (hwmon_pwm + 1)
 
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
@@ -323,7 +322,7 @@ int hl_get_temperature(struct hl_device *hdev,
        pkt.type = __cpu_to_le16(attr);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                       SENSORS_PKT_TIMEOUT, value);
+                                               0, value);
 
        if (rc) {
                dev_err(hdev->dev,
@@ -350,7 +349,7 @@ int hl_set_temperature(struct hl_device *hdev,
        pkt.value = __cpu_to_le64(value);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               SENSORS_PKT_TIMEOUT, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev,
@@ -374,7 +373,7 @@ int hl_get_voltage(struct hl_device *hdev,
        pkt.type = __cpu_to_le16(attr);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       SENSORS_PKT_TIMEOUT, value);
+                                               0, value);
 
        if (rc) {
                dev_err(hdev->dev,
@@ -400,7 +399,7 @@ int hl_get_current(struct hl_device *hdev,
        pkt.type = __cpu_to_le16(attr);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       SENSORS_PKT_TIMEOUT, value);
+                                               0, value);
 
        if (rc) {
                dev_err(hdev->dev,
@@ -426,7 +425,7 @@ int hl_get_fan_speed(struct hl_device *hdev,
        pkt.type = __cpu_to_le16(attr);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       SENSORS_PKT_TIMEOUT, value);
+                                               0, value);
 
        if (rc) {
                dev_err(hdev->dev,
@@ -452,7 +451,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
        pkt.type = __cpu_to_le16(attr);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       SENSORS_PKT_TIMEOUT, value);
+                                               0, value);
 
        if (rc) {
                dev_err(hdev->dev,
@@ -479,7 +478,7 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
        pkt.value = cpu_to_le64(value);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       SENSORS_PKT_TIMEOUT, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev,
@@ -502,7 +501,7 @@ int hl_set_voltage(struct hl_device *hdev,
        pkt.value = __cpu_to_le64(value);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               SENSORS_PKT_TIMEOUT, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev,
@@ -527,7 +526,7 @@ int hl_set_current(struct hl_device *hdev,
        pkt.value = __cpu_to_le64(value);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               SENSORS_PKT_TIMEOUT, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev,
index 5d78d5e1c7826163601cacdd12bac6a1a87e4e3e..70b6b1863c2ef3ee4042d57caef2672bda7d19c6 100644 (file)
@@ -9,9 +9,6 @@
 
 #include <linux/pci.h>
 
-#define SET_CLK_PKT_TIMEOUT    1000000 /* 1s */
-#define SET_PWR_PKT_TIMEOUT    1000000 /* 1s */
-
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 {
        struct armcp_packet pkt;
@@ -29,7 +26,7 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
        pkt.pll_index = cpu_to_le32(pll_index);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               SET_CLK_PKT_TIMEOUT, &result);
+                                               0, &result);
 
        if (rc) {
                dev_err(hdev->dev,
@@ -54,7 +51,7 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
        pkt.value = cpu_to_le64(freq);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       SET_CLK_PKT_TIMEOUT, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev,
@@ -74,7 +71,7 @@ u64 hl_get_max_power(struct hl_device *hdev)
                                ARMCP_PKT_CTL_OPCODE_SHIFT);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                               SET_PWR_PKT_TIMEOUT, &result);
+                                               0, &result);
 
        if (rc) {
                dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
@@ -96,7 +93,7 @@ void hl_set_max_power(struct hl_device *hdev, u64 value)
        pkt.value = cpu_to_le64(value);
 
        rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-                                       SET_PWR_PKT_TIMEOUT, NULL);
+                                               0, NULL);
 
        if (rc)
                dev_err(hdev->dev, "Failed to set max power, error %d\n", rc);