Commit | Line | Data |
---|---|---|
225c7b1f RD |
1 | /* |
2 | * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. | |
51a379d0 | 3 | * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. |
225c7b1f RD |
4 | * |
5 | * This software is available to you under a choice of one of two | |
6 | * licenses. You may choose to be licensed under the terms of the GNU | |
7 | * General Public License (GPL) Version 2, available from the file | |
8 | * COPYING in the main directory of this source tree, or the | |
9 | * OpenIB.org BSD license below: | |
10 | * | |
11 | * Redistribution and use in source and binary forms, with or | |
12 | * without modification, are permitted provided that the following | |
13 | * conditions are met: | |
14 | * | |
15 | * - Redistributions of source code must retain the above | |
16 | * copyright notice, this list of conditions and the following | |
17 | * disclaimer. | |
18 | * | |
19 | * - Redistributions in binary form must reproduce the above | |
20 | * copyright notice, this list of conditions and the following | |
21 | * disclaimer in the documentation and/or other materials | |
22 | * provided with the distribution. | |
23 | * | |
24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
25 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
26 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
27 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
28 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
29 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
30 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
31 | * SOFTWARE. | |
32 | */ | |
33 | ||
ee49bd93 | 34 | #include <linux/workqueue.h> |
9d9779e7 | 35 | #include <linux/module.h> |
ee49bd93 | 36 | |
225c7b1f RD |
37 | #include "mlx4.h" |
38 | ||
ee49bd93 JM |
39 | enum { |
40 | MLX4_CATAS_POLL_INTERVAL = 5 * HZ, | |
41 | }; | |
42 | ||
ee49bd93 | 43 | |
ee49bd93 | 44 | |
f5aef5aa YH |
45 | int mlx4_internal_err_reset = 1; |
46 | module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); | |
ee49bd93 | 47 | MODULE_PARM_DESC(internal_err_reset, |
55ad3592 | 48 | "Reset device on internal errors if non-zero (default 1)"); |
ee49bd93 | 49 | |
f6bc11e4 YH |
50 | static int read_vendor_id(struct mlx4_dev *dev) |
51 | { | |
52 | u16 vendor_id = 0; | |
53 | int ret; | |
54 | ||
55 | ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); | |
56 | if (ret) { | |
57 | mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); | |
58 | return ret; | |
59 | } | |
60 | ||
61 | if (vendor_id == 0xffff) { | |
62 | mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); | |
63 | return -EINVAL; | |
64 | } | |
65 | ||
66 | return 0; | |
67 | } | |
68 | ||
69 | static int mlx4_reset_master(struct mlx4_dev *dev) | |
70 | { | |
71 | int err = 0; | |
72 | ||
55ad3592 YH |
73 | if (mlx4_is_master(dev)) |
74 | mlx4_report_internal_err_comm_event(dev); | |
75 | ||
f6bc11e4 YH |
76 | if (!pci_channel_offline(dev->persist->pdev)) { |
77 | err = read_vendor_id(dev); | |
78 | /* If PCI can't be accessed to read vendor ID we assume that its | |
79 | * link was disabled and chip was already reset. | |
80 | */ | |
81 | if (err) | |
82 | return 0; | |
83 | ||
84 | err = mlx4_reset(dev); | |
85 | if (err) | |
86 | mlx4_err(dev, "Fail to reset HCA\n"); | |
87 | } | |
88 | ||
89 | return err; | |
90 | } | |
91 | ||
55ad3592 YH |
92 | static int mlx4_reset_slave(struct mlx4_dev *dev) |
93 | { | |
94 | #define COM_CHAN_RST_REQ_OFFSET 0x10 | |
95 | #define COM_CHAN_RST_ACK_OFFSET 0x08 | |
96 | ||
97 | u32 comm_flags; | |
98 | u32 rst_req; | |
99 | u32 rst_ack; | |
100 | unsigned long end; | |
101 | struct mlx4_priv *priv = mlx4_priv(dev); | |
102 | ||
103 | if (pci_channel_offline(dev->persist->pdev)) | |
104 | return 0; | |
105 | ||
106 | comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + | |
107 | MLX4_COMM_CHAN_FLAGS)); | |
108 | if (comm_flags == 0xffffffff) { | |
109 | mlx4_err(dev, "VF reset is not needed\n"); | |
110 | return 0; | |
111 | } | |
112 | ||
113 | if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { | |
114 | mlx4_err(dev, "VF reset is not supported\n"); | |
115 | return -EOPNOTSUPP; | |
116 | } | |
117 | ||
118 | rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> | |
119 | COM_CHAN_RST_REQ_OFFSET; | |
120 | rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> | |
121 | COM_CHAN_RST_ACK_OFFSET; | |
122 | if (rst_req != rst_ack) { | |
123 | mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); | |
124 | return -EIO; | |
125 | } | |
126 | ||
127 | rst_req ^= 1; | |
128 | mlx4_warn(dev, "VF is sending reset request to Firmware\n"); | |
129 | comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; | |
130 | __raw_writel((__force u32)cpu_to_be32(comm_flags), | |
131 | (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); | |
132 | /* Make sure that our comm channel write doesn't | |
133 | * get mixed in with writes from another CPU. | |
134 | */ | |
135 | mmiowb(); | |
136 | ||
137 | end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; | |
138 | while (time_before(jiffies, end)) { | |
139 | comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + | |
140 | MLX4_COMM_CHAN_FLAGS)); | |
141 | rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> | |
142 | COM_CHAN_RST_ACK_OFFSET; | |
143 | ||
144 | /* Reading rst_req again since the communication channel can | |
145 | * be reset at any time by the PF and all its bits will be | |
146 | * set to zero. | |
147 | */ | |
148 | rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> | |
149 | COM_CHAN_RST_REQ_OFFSET; | |
150 | ||
151 | if (rst_ack == rst_req) { | |
152 | mlx4_warn(dev, "VF Reset succeed\n"); | |
153 | return 0; | |
154 | } | |
155 | cond_resched(); | |
156 | } | |
157 | mlx4_err(dev, "Fail to send reset over the communication channel\n"); | |
158 | return -ETIMEDOUT; | |
159 | } | |
160 | ||
161 | static int mlx4_comm_internal_err(u32 slave_read) | |
162 | { | |
163 | return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == | |
164 | (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0; | |
165 | } | |
166 | ||
f6bc11e4 YH |
167 | void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) |
168 | { | |
169 | int err; | |
170 | struct mlx4_dev *dev; | |
171 | ||
f5aef5aa | 172 | if (!mlx4_internal_err_reset) |
f6bc11e4 YH |
173 | return; |
174 | ||
175 | mutex_lock(&persist->device_state_mutex); | |
176 | if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) | |
177 | goto out; | |
178 | ||
179 | dev = persist->dev; | |
180 | mlx4_err(dev, "device is going to be reset\n"); | |
55ad3592 YH |
181 | if (mlx4_is_slave(dev)) |
182 | err = mlx4_reset_slave(dev); | |
183 | else | |
184 | err = mlx4_reset_master(dev); | |
f6bc11e4 YH |
185 | BUG_ON(err != 0); |
186 | ||
187 | dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; | |
188 | mlx4_err(dev, "device was reset successfully\n"); | |
189 | mutex_unlock(&persist->device_state_mutex); | |
190 | ||
191 | /* At that step HW was already reset, now notify clients */ | |
192 | mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); | |
f5aef5aa | 193 | mlx4_cmd_wake_completions(dev); |
f6bc11e4 YH |
194 | return; |
195 | ||
196 | out: | |
197 | mutex_unlock(&persist->device_state_mutex); | |
198 | } | |
199 | ||
200 | static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) | |
201 | { | |
202 | int err = 0; | |
203 | ||
204 | mlx4_enter_error_state(persist); | |
c69453e2 YH |
205 | mutex_lock(&persist->interface_state_mutex); |
206 | if (persist->interface_state & MLX4_INTERFACE_STATE_UP && | |
207 | !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { | |
208 | err = mlx4_restart_one(persist->pdev); | |
209 | mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", | |
210 | err); | |
211 | } | |
212 | mutex_unlock(&persist->interface_state_mutex); | |
f6bc11e4 YH |
213 | } |
214 | ||
ee49bd93 | 215 | static void dump_err_buf(struct mlx4_dev *dev) |
225c7b1f RD |
216 | { |
217 | struct mlx4_priv *priv = mlx4_priv(dev); | |
218 | ||
219 | int i; | |
220 | ||
ee49bd93 | 221 | mlx4_err(dev, "Internal error detected:\n"); |
225c7b1f RD |
222 | for (i = 0; i < priv->fw.catas_size; ++i) |
223 | mlx4_err(dev, " buf[%02x]: %08x\n", | |
224 | i, swab32(readl(priv->catas_err.map + i))); | |
ee49bd93 | 225 | } |
225c7b1f | 226 | |
ee49bd93 JM |
227 | static void poll_catas(unsigned long dev_ptr) |
228 | { | |
229 | struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; | |
230 | struct mlx4_priv *priv = mlx4_priv(dev); | |
55ad3592 YH |
231 | u32 slave_read; |
232 | ||
233 | if (mlx4_is_slave(dev)) { | |
234 | slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); | |
235 | if (mlx4_comm_internal_err(slave_read)) { | |
236 | mlx4_warn(dev, "Internal error detected on the communication channel\n"); | |
237 | goto internal_err; | |
238 | } | |
239 | } else if (readl(priv->catas_err.map)) { | |
f6bc11e4 YH |
240 | dump_err_buf(dev); |
241 | goto internal_err; | |
242 | } | |
243 | ||
244 | if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { | |
245 | mlx4_warn(dev, "Internal error mark was detected on device\n"); | |
246 | goto internal_err; | |
247 | } | |
248 | ||
249 | mod_timer(&priv->catas_err.timer, | |
250 | round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); | |
251 | return; | |
252 | ||
253 | internal_err: | |
f5aef5aa | 254 | if (mlx4_internal_err_reset) |
f6bc11e4 | 255 | queue_work(dev->persist->catas_wq, &dev->persist->catas_work); |
225c7b1f RD |
256 | } |
257 | ||
ee49bd93 JM |
258 | static void catas_reset(struct work_struct *work) |
259 | { | |
ad9a0bf0 YH |
260 | struct mlx4_dev_persistent *persist = |
261 | container_of(work, struct mlx4_dev_persistent, | |
262 | catas_work); | |
57dbf29a | 263 | |
f6bc11e4 | 264 | mlx4_handle_error_state(persist); |
ee49bd93 JM |
265 | } |
266 | ||
267 | void mlx4_start_catas_poll(struct mlx4_dev *dev) | |
225c7b1f RD |
268 | { |
269 | struct mlx4_priv *priv = mlx4_priv(dev); | |
4979d18f | 270 | phys_addr_t addr; |
225c7b1f | 271 | |
ee49bd93 JM |
272 | INIT_LIST_HEAD(&priv->catas_err.list); |
273 | init_timer(&priv->catas_err.timer); | |
274 | priv->catas_err.map = NULL; | |
275 | ||
55ad3592 YH |
276 | if (!mlx4_is_slave(dev)) { |
277 | addr = pci_resource_start(dev->persist->pdev, | |
278 | priv->fw.catas_bar) + | |
279 | priv->fw.catas_offset; | |
280 | ||
281 | priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); | |
282 | if (!priv->catas_err.map) { | |
283 | mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", | |
284 | (unsigned long long)addr); | |
285 | return; | |
286 | } | |
ee49bd93 | 287 | } |
225c7b1f | 288 | |
ee49bd93 JM |
289 | priv->catas_err.timer.data = (unsigned long) dev; |
290 | priv->catas_err.timer.function = poll_catas; | |
291 | priv->catas_err.timer.expires = | |
292 | round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL); | |
293 | add_timer(&priv->catas_err.timer); | |
225c7b1f RD |
294 | } |
295 | ||
ee49bd93 | 296 | void mlx4_stop_catas_poll(struct mlx4_dev *dev) |
225c7b1f RD |
297 | { |
298 | struct mlx4_priv *priv = mlx4_priv(dev); | |
299 | ||
ee49bd93 JM |
300 | del_timer_sync(&priv->catas_err.timer); |
301 | ||
ad9a0bf0 | 302 | if (priv->catas_err.map) { |
225c7b1f | 303 | iounmap(priv->catas_err.map); |
ad9a0bf0 YH |
304 | priv->catas_err.map = NULL; |
305 | } | |
c69453e2 YH |
306 | |
307 | if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) | |
308 | flush_workqueue(dev->persist->catas_wq); | |
ad9a0bf0 | 309 | } |
ee49bd93 | 310 | |
ad9a0bf0 YH |
311 | int mlx4_catas_init(struct mlx4_dev *dev) |
312 | { | |
313 | INIT_WORK(&dev->persist->catas_work, catas_reset); | |
314 | dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); | |
315 | if (!dev->persist->catas_wq) | |
316 | return -ENOMEM; | |
317 | ||
318 | return 0; | |
ee49bd93 JM |
319 | } |
320 | ||
ad9a0bf0 | 321 | void mlx4_catas_end(struct mlx4_dev *dev) |
ee49bd93 | 322 | { |
ad9a0bf0 YH |
323 | if (dev->persist->catas_wq) { |
324 | destroy_workqueue(dev->persist->catas_wq); | |
325 | dev->persist->catas_wq = NULL; | |
326 | } | |
225c7b1f | 327 | } |