Commit | Line | Data |
---|---|---|
225c7b1f RD |
1 | /* |
2 | * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. | |
51a379d0 | 3 | * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. |
225c7b1f RD |
4 | * |
5 | * This software is available to you under a choice of one of two | |
6 | * licenses. You may choose to be licensed under the terms of the GNU | |
7 | * General Public License (GPL) Version 2, available from the file | |
8 | * COPYING in the main directory of this source tree, or the | |
9 | * OpenIB.org BSD license below: | |
10 | * | |
11 | * Redistribution and use in source and binary forms, with or | |
12 | * without modification, are permitted provided that the following | |
13 | * conditions are met: | |
14 | * | |
15 | * - Redistributions of source code must retain the above | |
16 | * copyright notice, this list of conditions and the following | |
17 | * disclaimer. | |
18 | * | |
19 | * - Redistributions in binary form must reproduce the above | |
20 | * copyright notice, this list of conditions and the following | |
21 | * disclaimer in the documentation and/or other materials | |
22 | * provided with the distribution. | |
23 | * | |
24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
25 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
26 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
27 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
28 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
29 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
30 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
31 | * SOFTWARE. | |
32 | */ | |
33 | ||
ee49bd93 | 34 | #include <linux/workqueue.h> |
9d9779e7 | 35 | #include <linux/module.h> |
ee49bd93 | 36 | |
225c7b1f RD |
37 | #include "mlx4.h" |
38 | ||
ee49bd93 JM |
39 | enum { |
40 | MLX4_CATAS_POLL_INTERVAL = 5 * HZ, | |
41 | }; | |
42 | ||
ee49bd93 | 43 | |
ee49bd93 | 44 | |
f5aef5aa YH |
45 | int mlx4_internal_err_reset = 1; |
46 | module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); | |
ee49bd93 | 47 | MODULE_PARM_DESC(internal_err_reset, |
d81c7186 JM |
48 | "Reset device on internal errors if non-zero" |
49 | " (default 1, in SRIOV mode default is 0)"); | |
ee49bd93 | 50 | |
f6bc11e4 YH |
51 | static int read_vendor_id(struct mlx4_dev *dev) |
52 | { | |
53 | u16 vendor_id = 0; | |
54 | int ret; | |
55 | ||
56 | ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); | |
57 | if (ret) { | |
58 | mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); | |
59 | return ret; | |
60 | } | |
61 | ||
62 | if (vendor_id == 0xffff) { | |
63 | mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); | |
64 | return -EINVAL; | |
65 | } | |
66 | ||
67 | return 0; | |
68 | } | |
69 | ||
70 | static int mlx4_reset_master(struct mlx4_dev *dev) | |
71 | { | |
72 | int err = 0; | |
73 | ||
74 | if (!pci_channel_offline(dev->persist->pdev)) { | |
75 | err = read_vendor_id(dev); | |
76 | /* If PCI can't be accessed to read vendor ID we assume that its | |
77 | * link was disabled and chip was already reset. | |
78 | */ | |
79 | if (err) | |
80 | return 0; | |
81 | ||
82 | err = mlx4_reset(dev); | |
83 | if (err) | |
84 | mlx4_err(dev, "Fail to reset HCA\n"); | |
85 | } | |
86 | ||
87 | return err; | |
88 | } | |
89 | ||
90 | void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) | |
91 | { | |
92 | int err; | |
93 | struct mlx4_dev *dev; | |
94 | ||
f5aef5aa | 95 | if (!mlx4_internal_err_reset) |
f6bc11e4 YH |
96 | return; |
97 | ||
98 | mutex_lock(&persist->device_state_mutex); | |
99 | if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) | |
100 | goto out; | |
101 | ||
102 | dev = persist->dev; | |
103 | mlx4_err(dev, "device is going to be reset\n"); | |
104 | err = mlx4_reset_master(dev); | |
105 | BUG_ON(err != 0); | |
106 | ||
107 | dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; | |
108 | mlx4_err(dev, "device was reset successfully\n"); | |
109 | mutex_unlock(&persist->device_state_mutex); | |
110 | ||
111 | /* At that step HW was already reset, now notify clients */ | |
112 | mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); | |
f5aef5aa | 113 | mlx4_cmd_wake_completions(dev); |
f6bc11e4 YH |
114 | return; |
115 | ||
116 | out: | |
117 | mutex_unlock(&persist->device_state_mutex); | |
118 | } | |
119 | ||
120 | static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) | |
121 | { | |
122 | int err = 0; | |
123 | ||
124 | mlx4_enter_error_state(persist); | |
c69453e2 YH |
125 | mutex_lock(&persist->interface_state_mutex); |
126 | if (persist->interface_state & MLX4_INTERFACE_STATE_UP && | |
127 | !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { | |
128 | err = mlx4_restart_one(persist->pdev); | |
129 | mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", | |
130 | err); | |
131 | } | |
132 | mutex_unlock(&persist->interface_state_mutex); | |
f6bc11e4 YH |
133 | } |
134 | ||
ee49bd93 | 135 | static void dump_err_buf(struct mlx4_dev *dev) |
225c7b1f RD |
136 | { |
137 | struct mlx4_priv *priv = mlx4_priv(dev); | |
138 | ||
139 | int i; | |
140 | ||
ee49bd93 | 141 | mlx4_err(dev, "Internal error detected:\n"); |
225c7b1f RD |
142 | for (i = 0; i < priv->fw.catas_size; ++i) |
143 | mlx4_err(dev, " buf[%02x]: %08x\n", | |
144 | i, swab32(readl(priv->catas_err.map + i))); | |
ee49bd93 | 145 | } |
225c7b1f | 146 | |
ee49bd93 JM |
147 | static void poll_catas(unsigned long dev_ptr) |
148 | { | |
149 | struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; | |
150 | struct mlx4_priv *priv = mlx4_priv(dev); | |
151 | ||
152 | if (readl(priv->catas_err.map)) { | |
f6bc11e4 YH |
153 | dump_err_buf(dev); |
154 | goto internal_err; | |
155 | } | |
156 | ||
157 | if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { | |
158 | mlx4_warn(dev, "Internal error mark was detected on device\n"); | |
159 | goto internal_err; | |
160 | } | |
161 | ||
162 | mod_timer(&priv->catas_err.timer, | |
163 | round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); | |
164 | return; | |
165 | ||
166 | internal_err: | |
f5aef5aa | 167 | if (mlx4_internal_err_reset) |
f6bc11e4 | 168 | queue_work(dev->persist->catas_wq, &dev->persist->catas_work); |
225c7b1f RD |
169 | } |
170 | ||
ee49bd93 JM |
171 | static void catas_reset(struct work_struct *work) |
172 | { | |
ad9a0bf0 YH |
173 | struct mlx4_dev_persistent *persist = |
174 | container_of(work, struct mlx4_dev_persistent, | |
175 | catas_work); | |
57dbf29a | 176 | |
f6bc11e4 | 177 | mlx4_handle_error_state(persist); |
ee49bd93 JM |
178 | } |
179 | ||
180 | void mlx4_start_catas_poll(struct mlx4_dev *dev) | |
225c7b1f RD |
181 | { |
182 | struct mlx4_priv *priv = mlx4_priv(dev); | |
4979d18f | 183 | phys_addr_t addr; |
225c7b1f | 184 | |
d81c7186 JM |
185 | /*If we are in SRIOV the default of the module param must be 0*/ |
186 | if (mlx4_is_mfunc(dev)) | |
f5aef5aa | 187 | mlx4_internal_err_reset = 0; |
d81c7186 | 188 | |
ee49bd93 JM |
189 | INIT_LIST_HEAD(&priv->catas_err.list); |
190 | init_timer(&priv->catas_err.timer); | |
191 | priv->catas_err.map = NULL; | |
192 | ||
872bf2fb | 193 | addr = pci_resource_start(dev->persist->pdev, priv->fw.catas_bar) + |
225c7b1f RD |
194 | priv->fw.catas_offset; |
195 | ||
196 | priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); | |
ee49bd93 | 197 | if (!priv->catas_err.map) { |
4979d18f RD |
198 | mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", |
199 | (unsigned long long) addr); | |
ee49bd93 JM |
200 | return; |
201 | } | |
225c7b1f | 202 | |
ee49bd93 JM |
203 | priv->catas_err.timer.data = (unsigned long) dev; |
204 | priv->catas_err.timer.function = poll_catas; | |
205 | priv->catas_err.timer.expires = | |
206 | round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); | |
207 | add_timer(&priv->catas_err.timer); | |
225c7b1f RD |
208 | } |
209 | ||
ee49bd93 | 210 | void mlx4_stop_catas_poll(struct mlx4_dev *dev) |
225c7b1f RD |
211 | { |
212 | struct mlx4_priv *priv = mlx4_priv(dev); | |
213 | ||
ee49bd93 JM |
214 | del_timer_sync(&priv->catas_err.timer); |
215 | ||
ad9a0bf0 | 216 | if (priv->catas_err.map) { |
225c7b1f | 217 | iounmap(priv->catas_err.map); |
ad9a0bf0 YH |
218 | priv->catas_err.map = NULL; |
219 | } | |
c69453e2 YH |
220 | |
221 | if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) | |
222 | flush_workqueue(dev->persist->catas_wq); | |
ad9a0bf0 | 223 | } |
ee49bd93 | 224 | |
ad9a0bf0 YH |
225 | int mlx4_catas_init(struct mlx4_dev *dev) |
226 | { | |
227 | INIT_WORK(&dev->persist->catas_work, catas_reset); | |
228 | dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); | |
229 | if (!dev->persist->catas_wq) | |
230 | return -ENOMEM; | |
231 | ||
232 | return 0; | |
ee49bd93 JM |
233 | } |
234 | ||
ad9a0bf0 | 235 | void mlx4_catas_end(struct mlx4_dev *dev) |
ee49bd93 | 236 | { |
ad9a0bf0 YH |
237 | if (dev->persist->catas_wq) { |
238 | destroy_workqueue(dev->persist->catas_wq); | |
239 | dev->persist->catas_wq = NULL; | |
240 | } | |
225c7b1f | 241 | } |