/*
 * drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
 *
 * UMC (Unified Memory Controller) RAS helpers: error query, bad-page
 * retirement, ECC interrupt handling and eeprom error-record filling.
 * (From commit: "drm/amdgpu: Modify .ras_late_init function pointer parameter")
 */
/*
 * Copyright 2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */
23
7cab2124 24#include "amdgpu.h"
86edcc7d 25
fec8c524 26static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
34cc4fd9 27 void *ras_error_status,
f4409ee8
TZ
28 struct amdgpu_iv_entry *entry,
29 bool reset)
34cc4fd9
TZ
30{
31 struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
513befa6 32 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
fdcb279d 33 int ret = 0;
34cc4fd9 34
34cc4fd9 35 kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
bc143d8b 36 ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
fdcb279d 37 if (ret == -EOPNOTSUPP) {
efe17d5a 38 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
39 adev->umc.ras->ras_block.hw_ops->query_ras_error_count)
40 adev->umc.ras->ras_block.hw_ops->query_ras_error_count(adev, ras_error_status);
fdcb279d 41
efe17d5a 42 if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
43 adev->umc.ras->ras_block.hw_ops->query_ras_error_address &&
fdcb279d
SY
44 adev->umc.max_ras_err_cnt_per_query) {
45 err_data->err_addr =
46 kcalloc(adev->umc.max_ras_err_cnt_per_query,
47 sizeof(struct eeprom_table_record), GFP_KERNEL);
48
49 /* still call query_ras_error_address to clear error status
50 * even NOMEM error is encountered
51 */
52 if(!err_data->err_addr)
53 dev_warn(adev->dev, "Failed to alloc memory for "
54 "umc error address record!\n");
55
56 /* umc query_ras_error_address is also responsible for clearing
57 * error status
58 */
efe17d5a 59 adev->umc.ras->ras_block.hw_ops->query_ras_error_address(adev, ras_error_status);
fdcb279d
SY
60 }
61 } else if (!ret) {
efe17d5a 62 if (adev->umc.ras &&
63 adev->umc.ras->ecc_info_query_ras_error_count)
64 adev->umc.ras->ecc_info_query_ras_error_count(adev, ras_error_status);
fdcb279d 65
efe17d5a 66 if (adev->umc.ras &&
67 adev->umc.ras->ecc_info_query_ras_error_address &&
fdcb279d
SY
68 adev->umc.max_ras_err_cnt_per_query) {
69 err_data->err_addr =
70 kcalloc(adev->umc.max_ras_err_cnt_per_query,
71 sizeof(struct eeprom_table_record), GFP_KERNEL);
72
73 /* still call query_ras_error_address to clear error status
74 * even NOMEM error is encountered
75 */
76 if(!err_data->err_addr)
77 dev_warn(adev->dev, "Failed to alloc memory for "
78 "umc error address record!\n");
79
80 /* umc query_ras_error_address is also responsible for clearing
81 * error status
82 */
efe17d5a 83 adev->umc.ras->ecc_info_query_ras_error_address(adev, ras_error_status);
fdcb279d 84 }
34cc4fd9
TZ
85 }
86
87 /* only uncorrectable error needs gpu reset */
88 if (err_data->ue_count) {
6952e99c
GC
89 dev_info(adev->dev, "%ld uncorrectable hardware errors "
90 "detected in UMC block\n",
91 err_data->ue_count);
1f3ef0ef 92
a219ecbb 93 if ((amdgpu_bad_page_threshold != 0) &&
22503d80 94 err_data->err_addr_cnt) {
a219ecbb 95 amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
22503d80
DL
96 err_data->err_addr_cnt);
97 amdgpu_ras_save_bad_pages(adev);
513befa6 98
bc143d8b 99 amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
22503d80 100 }
34cc4fd9 101
f4409ee8
TZ
102 if (reset)
103 amdgpu_ras_reset_gpu(adev);
34cc4fd9
TZ
104 }
105
106 kfree(err_data->err_addr);
107 return AMDGPU_RAS_SUCCESS;
108}
109
fec8c524
TZ
110int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
111 void *ras_error_status,
112 bool reset)
113{
114 int ret;
115 struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
116 struct ras_common_if head = {
117 .block = AMDGPU_RAS_BLOCK__UMC,
118 };
119 struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
120
121 ret =
122 amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
123
124 if (ret == AMDGPU_RAS_SUCCESS && obj) {
125 obj->err_data.ue_count += err_data->ue_count;
126 obj->err_data.ce_count += err_data->ce_count;
127 }
128
129 return ret;
130}
131
a3ace75c 132int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
fec8c524
TZ
133 void *ras_error_status,
134 struct amdgpu_iv_entry *entry)
135{
136 return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
137}
138
4e9b1fa5 139int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block)
fec8c524
TZ
140{
141 int r;
fec8c524 142
a3ace75c 143 r = amdgpu_ras_block_late_init(adev, adev->umc.ras_if);
fec8c524 144 if (r)
a3ace75c 145 return r;
fec8c524
TZ
146
147 if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
148 r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
149 if (r)
150 goto late_fini;
fec8c524
TZ
151 }
152
153 /* ras init of specific umc version */
efe17d5a 154 if (adev->umc.ras &&
155 adev->umc.ras->err_cnt_init)
156 adev->umc.ras->err_cnt_init(adev);
fec8c524
TZ
157
158 return 0;
159
160late_fini:
a3ace75c 161 amdgpu_ras_block_late_fini(adev, adev->umc.ras_if);
fec8c524
TZ
162 return r;
163}
164
165void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
166{
167 if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
a3ace75c 168 adev->umc.ras_if)
169 amdgpu_ras_block_late_fini(adev, adev->umc.ras_if);
fec8c524
TZ
170}
171
34cc4fd9
TZ
172int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
173 struct amdgpu_irq_src *source,
174 struct amdgpu_iv_entry *entry)
175{
03740baa 176 struct ras_common_if *ras_if = adev->umc.ras_if;
34cc4fd9
TZ
177 struct ras_dispatch_if ih_data = {
178 .entry = entry,
179 };
180
181 if (!ras_if)
182 return 0;
183
184 ih_data.head = *ras_if;
185
186 amdgpu_ras_interrupt_dispatch(adev, &ih_data);
187 return 0;
188}
400013b2
TZ
189
190void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
191 uint64_t err_addr,
192 uint64_t retired_page,
193 uint32_t channel_index,
194 uint32_t umc_inst)
195{
196 struct eeprom_table_record *err_rec =
197 &err_data->err_addr[err_data->err_addr_cnt];
198
199 err_rec->address = err_addr;
200 /* page frame address is saved */
201 err_rec->retired_page = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
202 err_rec->ts = (uint64_t)ktime_get_real_seconds();
203 err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
204 err_rec->cu = 0;
205 err_rec->mem_channel = channel_index;
206 err_rec->mcumc_id = umc_inst;
207
208 err_data->err_addr_cnt++;
209}