/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */
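/*
 * A sketch of the gen counter scheme described above (ptr_stale() and
 * PTR_GEN() live elsewhere in bcache; this is only an illustration): each
 * pointer embeds a copy of its bucket's 8 bit gen at allocation time, so
 * freeing or invalidating a bucket is just an increment, e.g.
 *
 *	bucket gen 5, PTR_GEN(k, i) == 5	pointer still live
 *	bucket reused, gen becomes 6		PTR_GEN(k, i) == 5 is stale
 *
 * Staleness is computed as a signed 8 bit distance so that gen wraparound
 * is handled correctly.
 */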

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "writeback.h"

static void sort_key_next(struct btree_iter *iter,
			  struct btree_iter_set *i)
{
	i->k = bkey_next(i->k);

	if (i->k == i->end)
		*i = iter->data[--iter->used];
}

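/*
 * Returns true if l should sort after r, so the iterator heap yields keys in
 * ascending order. Ties are broken by key address: within a node, newer sets
 * live at higher addresses, so of two equal keys the newer one is visited
 * first (the extent variant below documents the same newest-to-oldest
 * requirement explicitly).
 */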
static bool bch_key_sort_cmp(struct btree_iter_set l,
			     struct btree_iter_set r)
{
	int64_t c = bkey_cmp(l.k, r.k);

	return c ? c > 0 : l.k < r.k;
}

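/*
 * Sanity check a pointer against the bucket geometry: the extent must lie
 * entirely inside one bucket. A hypothetical worked example, assuming a
 * bucket_size of 1024 sectors: a pointer at offset 1000 within its bucket
 * (r == 1000) can hold at most 24 sectors, so KEY_SIZE(k) == 100 would make
 * KEY_SIZE(k) + r == 1100 > 1024 and the pointer invalid. The bucket index
 * itself must also fall inside [first_bucket, nbuckets).
 */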
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i)) {
			struct cache *ca = PTR_CACHE(c, k, i);
			size_t bucket = PTR_BUCKET_NR(c, k, i);
			size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

			if (KEY_SIZE(k) + r > c->sb.bucket_size ||
			    bucket < ca->sb.first_bucket ||
			    bucket >= ca->sb.nbuckets)
				return true;
		}

	return false;
}

/* Btree ptrs */

bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
	char buf[80];

	if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
		goto bad;

	if (__ptr_invalid(c, k))
		goto bad;

	return false;
bad:
	bch_bkey_to_text(buf, sizeof(buf), k);
	cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
	return true;
}

static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	return __bch_btree_ptr_invalid(b->c, k);
}

static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
{
	unsigned i;
	char buf[80];
	struct bucket *g;

	if (mutex_trylock(&b->c->bucket_lock)) {
		for (i = 0; i < KEY_PTRS(k); i++)
			if (ptr_available(b->c, k, i)) {
				g = PTR_BUCKET(b->c, k, i);

				if (KEY_DIRTY(k) ||
				    g->prio != BTREE_PRIO ||
				    (b->c->gc_mark_valid &&
				     GC_MARK(g) != GC_MARK_METADATA))
					goto err;
			}

		mutex_unlock(&b->c->bucket_lock);
	}

	return false;
err:
	mutex_unlock(&b->c->bucket_lock);
	bch_bkey_to_text(buf, sizeof(buf), k);
	btree_bug(b,
"inconsistent btree pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
		  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
	return true;
}

static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	unsigned i;

	if (!bkey_cmp(k, &ZERO_KEY) ||
	    !KEY_PTRS(k) ||
	    bch_ptr_invalid(bk, k))
		return true;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (!ptr_available(b->c, k, i) ||
		    ptr_stale(b->c, k, i))
			return true;

	if (expensive_debug_checks(b->c) &&
	    btree_ptr_bad_expensive(b, k))
		return true;

	return false;
}

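/*
 * Ops for the btree node (metadata) keyspace: these hook the generic key
 * sorting and iteration code, which manipulates keys only through a
 * struct btree_keys_ops vtable.
 */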
const struct btree_keys_ops bch_btree_keys_ops = {
	.sort_cmp	= bch_key_sort_cmp,
	.key_invalid	= bch_btree_ptr_invalid,
	.key_bad	= bch_btree_ptr_bad,
};

/* Extents */

/*
 * Returns true if l > r - unless l == r, in which case returns true if l is
 * older than r.
 *
 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 * equal in different sets, we have to process them newest to oldest.
 */
static bool bch_extent_sort_cmp(struct btree_iter_set l,
				struct btree_iter_set r)
{
	int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));

	return c ? c > 0 : l.k < r.k;
}

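/*
 * bch_extent_sort_fixup() below resolves overlaps between the two smallest
 * keys in the iterator before one is returned. A hedged sketch of the three
 * interesting cases, writing extents as [start, end) with "new" the key from
 * the newer set:
 *
 *	new [0, 8), old [4, 12)		old's front is cut: old -> [8, 12)
 *	new [0, 8), old [2, 6)		old is completely covered and dropped
 *	old [0, 12), new [4, 8)		old must be split: the front fragment
 *					[0, 4) is returned via *tmp, and old
 *					shrinks to [8, 12)
 *
 * The heap is re-sifted after any key is modified so ordering stays correct.
 */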
static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
					  struct bkey *tmp)
{
	while (iter->used > 1) {
		struct btree_iter_set *top = iter->data, *i = top + 1;

		if (iter->used > 2 &&
		    bch_extent_sort_cmp(i[0], i[1]))
			i++;

		if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
			break;

		if (!KEY_SIZE(i->k)) {
			sort_key_next(iter, i);
			heap_sift(iter, i - top, bch_extent_sort_cmp);
			continue;
		}

		if (top->k > i->k) {
			if (bkey_cmp(top->k, i->k) >= 0)
				sort_key_next(iter, i);
			else
				bch_cut_front(top->k, i->k);

			heap_sift(iter, i - top, bch_extent_sort_cmp);
		} else {
			/* can't happen because of comparison func */
			BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));

			if (bkey_cmp(i->k, top->k) < 0) {
				bkey_copy(tmp, top->k);

				bch_cut_back(&START_KEY(i->k), tmp);
				bch_cut_front(i->k, top->k);
				heap_sift(iter, 0, bch_extent_sort_cmp);

				return tmp;
			} else {
				bch_cut_back(&START_KEY(i->k), top->k);
			}
		}
	}

	return NULL;
}

static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	char buf[80];

	if (!KEY_SIZE(k))
		return true;

	if (KEY_SIZE(k) > KEY_OFFSET(k))
		goto bad;

	if (__ptr_invalid(b->c, k))
		goto bad;

	return false;
bad:
	bch_bkey_to_text(buf, sizeof(buf), k);
	cache_bug(b->c, "spotted extent %s: %s", buf, bch_ptr_status(b->c, k));
	return true;
}

static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
				     unsigned ptr)
{
	struct bucket *g = PTR_BUCKET(b->c, k, ptr);
	char buf[80];

	if (mutex_trylock(&b->c->bucket_lock)) {
		if (b->c->gc_mark_valid &&
		    ((GC_MARK(g) != GC_MARK_DIRTY &&
		      KEY_DIRTY(k)) ||
		     GC_MARK(g) == GC_MARK_METADATA))
			goto err;

		if (g->prio == BTREE_PRIO)
			goto err;

		mutex_unlock(&b->c->bucket_lock);
	}

	return false;
err:
	mutex_unlock(&b->c->bucket_lock);
	bch_bkey_to_text(buf, sizeof(buf), k);
	btree_bug(b,
"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
		  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
		  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
	return true;
}

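/*
 * In bch_extent_bad() below, the "key too stale" threshold of 96 appears to
 * correspond to BUCKET_GC_GEN_MAX (bcache.h): garbage collection is forced
 * before a bucket's gen can get further than that ahead of the pointers into
 * it, so a larger distance indicates corruption rather than normal aging.
 */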
static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
	struct btree *b = container_of(bk, struct btree, keys);
	struct bucket *g;
	unsigned i, stale;

	if (!KEY_PTRS(k) ||
	    bch_extent_invalid(bk, k))
		return true;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (!ptr_available(b->c, k, i))
			return true;

	if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
		return false;

	for (i = 0; i < KEY_PTRS(k); i++) {
		g = PTR_BUCKET(b->c, k, i);
		stale = ptr_stale(b->c, k, i);

		btree_bug_on(stale > 96, b,
			     "key too stale: %i, need_gc %u",
			     stale, b->c->need_gc);

		btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
			     b, "stale dirty pointer");

		if (stale)
			return true;

		if (expensive_debug_checks(b->c) &&
		    bch_extent_bad_expensive(b, k, i))
			return true;
	}

	return false;
}

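/*
 * When KEY_CSUM() is set, the 64 bit word just past the last pointer holds
 * the key's checksum. Merging two keys sums the two values and clears the
 * top bit, i.e. the result is taken modulo 2^63; presumably this keeps the
 * stored checksum within the 63 usable bits of the field.
 */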
static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
{
	return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
		~((uint64_t)1 << 63);
}

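/*
 * bch_extent_merge() below joins two adjacent extents in place. A worked
 * example with hypothetical numbers: extents are keyed by their end offset,
 * so l with KEY_OFFSET 16, KEY_SIZE 8 covers sectors [8, 16) and r with
 * KEY_OFFSET 24, KEY_SIZE 8 covers [16, 24); bkey_cmp(l, &START_KEY(r)) is
 * then 0, and if the device pointers are contiguous as well the merged key
 * ends up with KEY_OFFSET 24, KEY_SIZE 16, covering [8, 24).
 */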
static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
{
	struct btree *b = container_of(bk, struct btree, keys);
	unsigned i;

	if (key_merging_disabled(b->c))
		return false;

	if (KEY_PTRS(l) != KEY_PTRS(r) ||
	    KEY_DIRTY(l) != KEY_DIRTY(r) ||
	    bkey_cmp(l, &START_KEY(r)))
		return false;

	for (i = 0; i < KEY_PTRS(l); i++)
		if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
		    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
			return false;

	/* Keys with no pointers aren't restricted to one bucket and could
	 * overflow KEY_SIZE
	 */
	if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
		SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
		SET_KEY_SIZE(l, USHRT_MAX);

		bch_cut_front(l, r);
		return false;
	}

	if (KEY_CSUM(l)) {
		if (KEY_CSUM(r))
			l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
		else
			SET_KEY_CSUM(l, 0);
	}

	SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
	SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));

	return true;
}

const struct btree_keys_ops bch_extent_keys_ops = {
	.sort_cmp	= bch_extent_sort_cmp,
	.sort_fixup	= bch_extent_sort_fixup,
	.key_invalid	= bch_extent_invalid,
	.key_bad	= bch_extent_bad,
	.key_merge	= bch_extent_merge,
	.is_extents	= true,
};