set_bit(BNXT_STATE_FW_ACTIVATE_RESET, &bp->state);
} else if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) {
type_str = "Fatal";
+ bp->fw_health->fatalities++;
set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state);
} else if (data2 && BNXT_FW_STATUS_HEALTHY !=
EVENT_DATA2_RESET_NOTIFY_FW_STATUS_CODE(data2)) {
type_str = "Non-fatal";
+ bp->fw_health->survivals++;
set_bit(BNXT_STATE_FW_NON_FATAL_COND, &bp->state);
}
netif_warn(bp, hw, bp->dev,
if (!bp->fw_health)
return -ENOMEM;
+ mutex_init(&bp->fw_health->lock);
return 0;
}
struct bnxt_fw_health *fw_health = bp->fw_health;
u32 reg_type;
- if (!fw_health || !fw_health->status_reliable)
+ if (!fw_health)
return;
reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_HEALTH_REG]);
if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC)
fw_health->status_reliable = false;
+
+ reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_RESET_CNT_REG]);
+ if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC)
+ fw_health->resets_reliable = false;
}
static void bnxt_try_map_fw_health_reg(struct bnxt *bp)
int i;
bp->fw_health->status_reliable = false;
+ bp->fw_health->resets_reliable = false;
/* Only pre-map the monitoring GRC registers using window 3 */
for (i = 0; i < 4; i++) {
u32 reg = fw_health->regs[i];
fw_health->mapped_regs[i] = BNXT_FW_HEALTH_WIN_OFF(reg);
}
bp->fw_health->status_reliable = true;
+ bp->fw_health->resets_reliable = true;
if (reg_base == 0xffffffff)
return 0;
}
val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG);
- if (val == fw_health->last_fw_heartbeat)
+ if (val == fw_health->last_fw_heartbeat) {
+ fw_health->arrests++;
goto fw_reset;
+ }
fw_health->last_fw_heartbeat = val;
val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
- if (val != fw_health->last_fw_reset_cnt)
+ if (val != fw_health->last_fw_reset_cnt) {
+ fw_health->discoveries++;
goto fw_reset;
+ }
fw_health->tmr_counter = fw_health->tmr_multiplier;
return;
return hwrm_req_send(bp, req);
}
+static char *bnxt_health_severity_str(enum bnxt_health_severity severity)
+{
+ switch (severity) {
+ case SEVERITY_NORMAL: return "normal";
+ case SEVERITY_WARNING: return "warning";
+ case SEVERITY_RECOVERABLE: return "recoverable";
+ case SEVERITY_FATAL: return "fatal";
+ default: return "unknown";
+ }
+}
+
+static char *bnxt_health_remedy_str(enum bnxt_health_remedy remedy)
+{
+ switch (remedy) {
+ case REMEDY_DEVLINK_RECOVER: return "devlink recover";
+ case REMEDY_POWER_CYCLE_DEVICE: return "device power cycle";
+ case REMEDY_POWER_CYCLE_HOST: return "host power cycle";
+ case REMEDY_FW_UPDATE: return "update firmware";
+ case REMEDY_HW_REPLACE: return "replace hardware";
+ default: return "unknown";
+ }
+}
+
static int bnxt_fw_diagnose(struct devlink_health_reporter *reporter,
struct devlink_fmsg *fmsg,
struct netlink_ext_ack *extack)
{
struct bnxt *bp = devlink_health_reporter_priv(reporter);
- u32 val;
+ struct bnxt_fw_health *h = bp->fw_health;
+ u32 fw_status, fw_resets;
int rc;
if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state))
- return 0;
+ return devlink_fmsg_string_pair_put(fmsg, "Status", "recovering");
- val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG);
+ if (!h->status_reliable)
+ return devlink_fmsg_string_pair_put(fmsg, "Status", "unknown");
- if (BNXT_FW_IS_BOOTING(val)) {
- rc = devlink_fmsg_string_pair_put(fmsg, "Description",
- "Not yet completed initialization");
+ mutex_lock(&h->lock);
+ fw_status = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG);
+ if (BNXT_FW_IS_BOOTING(fw_status)) {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Status", "initializing");
if (rc)
- return rc;
- } else if (BNXT_FW_IS_ERR(val)) {
- rc = devlink_fmsg_string_pair_put(fmsg, "Description",
- "Encountered fatal error and cannot recover");
+ goto unlock;
+ } else if (h->severity || fw_status != BNXT_FW_STATUS_HEALTHY) {
+ if (!h->severity) {
+ h->severity = SEVERITY_FATAL;
+ h->remedy = REMEDY_POWER_CYCLE_DEVICE;
+ h->diagnoses++;
+ devlink_health_report(h->fw_reporter,
+ "FW error diagnosed", h);
+ }
+ rc = devlink_fmsg_string_pair_put(fmsg, "Status", "error");
if (rc)
- return rc;
+ goto unlock;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Syndrome", fw_status);
+ if (rc)
+ goto unlock;
+ } else {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Status", "healthy");
+ if (rc)
+ goto unlock;
}
- if (val >> 16) {
- rc = devlink_fmsg_u32_pair_put(fmsg, "Error code", val >> 16);
+ rc = devlink_fmsg_string_pair_put(fmsg, "Severity",
+ bnxt_health_severity_str(h->severity));
+ if (rc)
+ goto unlock;
+
+ if (h->severity) {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Remedy",
+ bnxt_health_remedy_str(h->remedy));
if (rc)
- return rc;
+ goto unlock;
+ if (h->remedy == REMEDY_DEVLINK_RECOVER) {
+ rc = devlink_fmsg_string_pair_put(fmsg, "Impact",
+ "traffic+ntuple_cfg");
+ if (rc)
+ goto unlock;
+ }
}
- val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
- rc = devlink_fmsg_u32_pair_put(fmsg, "Reset count", val);
- if (rc)
+unlock:
+ mutex_unlock(&h->lock);
+ if (rc || !h->resets_reliable)
return rc;
- return 0;
+ fw_resets = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG);
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Resets", fw_resets);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Arrests", h->arrests);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Survivals", h->survivals);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Discoveries", h->discoveries);
+ if (rc)
+ return rc;
+ rc = devlink_fmsg_u32_pair_put(fmsg, "Fatalities", h->fatalities);
+ if (rc)
+ return rc;
+ return devlink_fmsg_u32_pair_put(fmsg, "Diagnoses", h->diagnoses);
}
static int bnxt_fw_recover(struct devlink_health_reporter *reporter,
{
struct bnxt *bp = devlink_health_reporter_priv(reporter);
+ if (bp->fw_health->severity == SEVERITY_FATAL)
+ return -ENODEV;
+
set_bit(BNXT_STATE_RECOVER, &bp->state);
__bnxt_fw_recover(bp);
void bnxt_devlink_health_fw_report(struct bnxt *bp)
{
struct bnxt_fw_health *fw_health = bp->fw_health;
+ int rc;
if (!fw_health)
return;
return;
}
- devlink_health_report(fw_health->fw_reporter, "FW error reported", NULL);
+ mutex_lock(&fw_health->lock);
+ fw_health->severity = SEVERITY_RECOVERABLE;
+ fw_health->remedy = REMEDY_DEVLINK_RECOVER;
+ mutex_unlock(&fw_health->lock);
+ rc = devlink_health_report(fw_health->fw_reporter, "FW error reported",
+ fw_health);
+ if (rc == -ECANCELED)
+ __bnxt_fw_recover(bp);
}
void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy)
{
- struct bnxt_fw_health *health = bp->fw_health;
+ struct bnxt_fw_health *fw_health = bp->fw_health;
u8 state;
- if (healthy)
+ mutex_lock(&fw_health->lock);
+ if (healthy) {
+ fw_health->severity = SEVERITY_NORMAL;
state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
- else
+ } else {
+ fw_health->severity = SEVERITY_FATAL;
+ fw_health->remedy = REMEDY_POWER_CYCLE_DEVICE;
state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
-
- devlink_health_reporter_state_update(health->fw_reporter, state);
+ }
+ mutex_unlock(&fw_health->lock);
+ devlink_health_reporter_state_update(fw_health->fw_reporter, state);
}
void bnxt_dl_health_fw_recovery_done(struct bnxt *bp)