// SPDX-License-Identifier: GPL-2.0
/*
 * Bad block management
 *
 * - Heavily based on MD badblocks code from Neil Brown
 *
 * Copyright (c) 2015, Intel Corporation.
 */

#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>

/**
 * badblocks_check() - check a given range for bad sectors
 * @bb: the badblocks structure that holds all badblock information
 * @s: sector (start) at which to check for badblocks
 * @sectors: number of sectors to check for badblocks
 * @first_bad: pointer to store location of the first badblock
 * @bad_sectors: pointer to store number of badblocks after @first_bad
 *
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64bits wide. This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 * A 'shift' can be set so that larger blocks are tracked and
 * consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit. - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so badblocks_check
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad. So we binary-search
 * to the last range that starts at-or-before the given endpoint,
 * (or "before the sector after the target range")
 * then see if it ends after the given start.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata,
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
			sector_t *first_bad, int *bad_sectors)
{
	int hi;
	int lo;
	u64 *p = bb->page;
	int rv;
	sector_t target = s + sectors;
	unsigned seq;

	if (bb->shift > 0) {
		/* round the start down, and the end up */
		s >>= bb->shift;
		target += (1<<bb->shift) - 1;
		target >>= bb->shift;
	}
	/* 'target' is now the first block after the bad range */
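	/*
	 * Illustrative example (not from the original source): with
	 * bb->shift == 3 (8-sector blocks), a query for s = 1000,
	 * sectors = 4 becomes s = 1000 >> 3 = 125 and
	 * target = (1004 + 7) >> 3 = 126, so the whole 8-sector block
	 * containing the range is checked.
	 */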

retry:
	seq = read_seqbegin(&bb->lock);
	lo = 0;
	rv = 0;
	hi = bb->count;

	/* Binary search between lo and hi for 'target'
	 * i.e. for the last range that starts before 'target'
	 */
	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
	 * are known not to be the last range before target.
	 * VARIANT: hi-lo is the number of possible
	 * ranges, and decreases until it reaches 1
	 */
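	/*
	 * Illustrative walk-through: with ranges starting at sectors 10,
	 * 50 and 90 and target == 60, the loop below converges on the
	 * range starting at 50 - the last one that starts before 'target'.
	 */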
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			/* This could still be the one, earlier ranges
			 * could not.
			 */
			lo = mid;
		else
			/* This and later ranges are definitely out. */
			hi = mid;
	}
	/* 'lo' might be the last that started before target, but 'hi' isn't */
	if (hi > lo) {
		/* need to check all ranges that end after 's' to see if
		 * any are unacknowledged.
		 */
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			if (BB_OFFSET(p[lo]) < target) {
				/* starts before the end, and finishes after
				 * the start, so they must overlap
				 */
				if (rv != -1 && BB_ACK(p[lo]))
					rv = 1;
				else
					rv = -1;
				*first_bad = BB_OFFSET(p[lo]);
				*bad_sectors = BB_LEN(p[lo]);
			}
			lo--;
		}
	}

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
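
/*
 * Example usage of badblocks_check() (illustrative; assumes 'bb' was
 * initialized with badblocks_init(), shift == 0, and the table holds one
 * acknowledged entry covering sectors 32..95):
 *
 *	sector_t first_bad;
 *	int bad_sectors;
 *	int rv = badblocks_check(bb, 60, 8, &first_bad, &bad_sectors);
 *
 * Here rv == 1 (known, acknowledged bad blocks overlap the query) and
 * first_bad == 32, bad_sectors == 64.
 */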

static void badblocks_update_acked(struct badblocks *bb)
{
	u64 *p = bb->page;
	int i;
	bool unacked = false;

	if (!bb->unacked_exist)
		return;

	for (i = 0; i < bb->count; i++) {
		if (!BB_ACK(p[i])) {
			unacked = true;
			break;
		}
	}

	if (!unacked)
		bb->unacked_exist = 0;
}

/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb: the badblocks structure that holds all badblock information
 * @s: first sector to mark as bad
 * @sectors: number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged. We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space, or badblocks are disabled)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
			int acknowledged)
{
	u64 *p;
	int lo, hi;
	int rv = 0;
	unsigned long flags;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 1;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		s >>= bb->shift;
		next += (1<<bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts at-or-before 's' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a <= s)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo && BB_OFFSET(p[lo]) > s)
		hi = lo;

	if (hi > lo) {
		/* we found a range that might merge with the start
		 * of our new range
		 */
		sector_t a = BB_OFFSET(p[lo]);
		sector_t e = a + BB_LEN(p[lo]);
		int ack = BB_ACK(p[lo]);

		if (e >= s) {
			/* Yes, we can merge with a previous range */
			if (s == a && s + sectors >= e)
				/* new range covers old */
				ack = acknowledged;
			else
				ack = ack && acknowledged;

			if (e < s + sectors)
				e = s + sectors;
			if (e - a <= BB_MAX_LEN) {
				p[lo] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				/* does not all fit in one range,
				 * make p[lo] maximal
				 */
				if (BB_LEN(p[lo]) != BB_MAX_LEN)
					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
		}
	}
	if (sectors && hi < bb->count) {
		/* 'hi' points to the first range that starts after 's'.
		 * Maybe we can merge with the start of that range
		 */
		sector_t a = BB_OFFSET(p[hi]);
		sector_t e = a + BB_LEN(p[hi]);
		int ack = BB_ACK(p[hi]);

		if (a <= s + sectors) {
			/* merging is possible */
			if (e <= s + sectors) {
				/* full overlap */
				e = s + sectors;
				ack = acknowledged;
			} else
				ack = ack && acknowledged;

			a = s;
			if (e - a <= BB_MAX_LEN) {
				p[hi] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
			lo = hi;
			hi++;
		}
	}
	if (sectors == 0 && hi < bb->count) {
		/* we might be able to combine lo and hi */
		/* Note: 's' is at the end of 'lo' */
		sector_t a = BB_OFFSET(p[hi]);
		int lolen = BB_LEN(p[lo]);
		int hilen = BB_LEN(p[hi]);
		int newlen = lolen + hilen - (s - a);

		if (s >= a && newlen < BB_MAX_LEN) {
			/* yes, we can combine them */
			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);

			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
			memmove(p + hi, p + hi + 1,
				(bb->count - hi - 1) * 8);
			bb->count--;
		}
	}
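	/*
	 * Illustrative: if 1000 sectors still remain unmerged at this
	 * point, the loop below stores them as two new entries of 512
	 * (BB_MAX_LEN) and 488 sectors, since a single entry's length
	 * field cannot exceed BB_MAX_LEN.
	 */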
	while (sectors) {
		/* didn't merge all of it (or any of it);
		 * need to add one or more ranges just before 'hi'
		 */
		if (bb->count >= MAX_BADBLOCKS) {
			/* No room for more */
			rv = 1;
			break;
		} else {
			int this_sectors = sectors;

			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	else
		badblocks_update_acked(bb);
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
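
/*
 * Example usage of badblocks_set() (illustrative; assumes an empty,
 * enabled table with shift == 0): two touching ranges are merged into a
 * single entry.
 *
 *	badblocks_set(bb, 100, 20, 1);
 *	badblocks_set(bb, 120, 20, 1);
 *
 * The table now holds one acknowledged entry of 40 sectors starting at
 * sector 100.
 */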

/**
 * badblocks_clear() - Remove a range of bad blocks from the table.
 * @bb: the badblocks structure that holds all badblock information
 * @s: first sector to mark as clean
 * @sectors: number of sectors to mark as clean
 *
 * This may involve extending the table if we split a region,
 * but it must not fail. So if the table becomes full, we just
 * drop the remove request.
 *
 * Return:
 *  0: success
 *  -ENOSPC: failed to clear badblocks (no room to split an existing range)
 */
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	u64 *p;
	int lo, hi;
	sector_t target = s + sectors;
	int rv = 0;

	if (bb->shift > 0) {
		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		s += (1<<bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
	}

	write_seqlock_irq(&bb->lock);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts before 'target' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* p[lo] is the last range that could overlap the
		 * current range. Earlier ranges could also overlap,
		 * but only this one can overlap the end of the range.
		 */
		if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
		    (BB_OFFSET(p[lo]) < target)) {
			/* Partial overlap, leave the tail of this range */
			int ack = BB_ACK(p[lo]);
			sector_t a = BB_OFFSET(p[lo]);
			sector_t end = a + BB_LEN(p[lo]);

			if (a < s) {
				/* we need to split this range */
				if (bb->count >= MAX_BADBLOCKS) {
					rv = -ENOSPC;
					goto out;
				}
				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
				bb->count++;
				p[lo] = BB_MAKE(a, s-a, ack);
				lo++;
			}
			p[lo] = BB_MAKE(target, end - target, ack);
			/* there is no longer an overlap */
			hi = lo;
			lo--;
		}
		while (lo >= 0 &&
		       (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
		       (BB_OFFSET(p[lo]) < target)) {
			/* This range does overlap */
			if (BB_OFFSET(p[lo]) < s) {
				/* Keep the early parts of this range. */
				int ack = BB_ACK(p[lo]);
				sector_t start = BB_OFFSET(p[lo]);

				p[lo] = BB_MAKE(start, s - start, ack);
				/* now p[lo] doesn't overlap, so.. */
				break;
			}
			lo--;
		}
		/* 'lo' is strictly before, 'hi' is strictly after,
		 * anything between needs to be discarded
		 */
		if (hi - lo > 1) {
			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
			bb->count -= (hi - lo - 1);
		}
	}

	badblocks_update_acked(bb);
	bb->changed = 1;
out:
	write_sequnlock_irq(&bb->lock);
	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_clear);
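
/*
 * Example usage of badblocks_clear() (illustrative; assumes shift == 0):
 * clearing the middle of a bad range splits it into two entries.
 *
 *	badblocks_set(bb, 100, 40, 1);	// one entry: sectors 100..139
 *	badblocks_clear(bb, 110, 10);	// entries: 100..109 and 120..139
 */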

/**
 * ack_all_badblocks() - Acknowledge all bad blocks in a list.
 * @bb: the badblocks structure that holds all badblock information
 *
 * This only succeeds if ->changed is clear. It is used by
 * in-kernel metadata updates.
 */
void ack_all_badblocks(struct badblocks *bb)
{
	if (bb->page == NULL || bb->changed)
		/* no point even trying */
		return;
	write_seqlock_irq(&bb->lock);

	if (bb->changed == 0 && bb->unacked_exist) {
		u64 *p = bb->page;
		int i;

		for (i = 0; i < bb->count; i++) {
			if (!BB_ACK(p[i])) {
				sector_t start = BB_OFFSET(p[i]);
				int len = BB_LEN(p[i]);

				p[i] = BB_MAKE(start, len, 1);
			}
		}
		bb->unacked_exist = 0;
	}
	write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(ack_all_badblocks);
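
/*
 * Example (illustrative): a metadata writer records the current table on
 * stable storage, clears ->changed, and then acknowledges everything:
 *
 *	bb->changed = 0;	// table is now safely recorded in metadata
 *	ack_all_badblocks(bb);
 */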

/**
 * badblocks_show() - sysfs access to bad-blocks list
 * @bb: the badblocks structure that holds all badblock information
 * @page: buffer to fill for sysfs
 * @unack: whether to show only unacknowledged badblocks
 *
 * Return:
 *  Length of returned data
 */
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
	size_t len;
	int i;
	u64 *p = bb->page;
	unsigned seq;

	if (bb->shift < 0)
		return 0;

retry:
	seq = read_seqbegin(&bb->lock);

	len = 0;
	i = 0;

	while (len < PAGE_SIZE && i < bb->count) {
		sector_t s = BB_OFFSET(p[i]);
		unsigned int length = BB_LEN(p[i]);
		int ack = BB_ACK(p[i]);

		i++;

		if (unack && ack)
			continue;

		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
				(unsigned long long)s << bb->shift,
				length << bb->shift);
	}
	if (unack && len == 0)
		bb->unacked_exist = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);
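
/*
 * Example output (illustrative): with entries (100, 10) and (200, 4) and
 * shift == 0, badblocks_show() formats one "sector length" pair per line
 * into @page:
 *
 *	100 10
 *	200 4
 */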

/**
 * badblocks_store() - sysfs access to bad-blocks list
 * @bb: the badblocks structure that holds all badblock information
 * @page: buffer received from sysfs
 * @len: length of data received from sysfs
 * @unack: whether to store the new badblocks as unacknowledged
 *
 * Return:
 *  Length of the buffer processed or -ve error.
 */
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
			int unack)
{
	unsigned long long sector;
	int length;
	char newline;

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -EINVAL;
		fallthrough;
	case 2:
		if (length <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (badblocks_set(bb, sector, length, !unack))
		return -ENOSPC;
	else
		return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
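
/*
 * Example input (illustrative): writing "1024 8\n" via sysfs marks
 * sectors 1024..1031 bad, acknowledged when @unack is 0:
 *
 *	badblocks_store(bb, "1024 8\n", 7, 0);
 */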

static int __badblocks_init(struct device *dev, struct badblocks *bb,
		int enable)
{
	bb->dev = dev;
	bb->count = 0;
	if (enable)
		bb->shift = 0;
	else
		bb->shift = -1;
	if (dev)
		bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
	else
		bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!bb->page) {
		bb->shift = -1;
		return -ENOMEM;
	}
	seqlock_init(&bb->lock);

	return 0;
}

/**
 * badblocks_init() - initialize the badblocks structure
 * @bb: the badblocks structure that holds all badblock information
 * @enable: whether to enable badblocks accounting
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
	return __badblocks_init(NULL, bb, enable);
}
EXPORT_SYMBOL_GPL(badblocks_init);

int devm_init_badblocks(struct device *dev, struct badblocks *bb)
{
	if (!bb)
		return -EINVAL;
	return __badblocks_init(dev, bb, 1);
}
EXPORT_SYMBOL_GPL(devm_init_badblocks);
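
/*
 * Example lifecycle (illustrative): a non-devm user pairs badblocks_init()
 * with badblocks_exit():
 *
 *	struct badblocks bb;
 *	int err = badblocks_init(&bb, 1);
 *
 *	if (err)
 *		return err;
 *	...
 *	badblocks_exit(&bb);
 */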

/**
 * badblocks_exit() - free the badblocks structure
 * @bb: the badblocks structure that holds all badblock information
 */
void badblocks_exit(struct badblocks *bb)
{
	if (!bb)
		return;
	if (bb->dev)
		devm_kfree(bb->dev, bb->page);
	else
		kfree(bb->page);
	bb->page = NULL;
}
EXPORT_SYMBOL_GPL(badblocks_exit);