Commit | Line | Data |
---|---|---|
3241b1d3 JT |
1 | /* |
2 | * Copyright (C) 2011 Red Hat, Inc. | |
3 | * | |
4 | * This file is released under the GPL. | |
5 | */ | |
6 | ||
7 | #include "dm-btree.h" | |
8 | #include "dm-btree-internal.h" | |
9 | #include "dm-transaction-manager.h" | |
10 | ||
1944ce60 | 11 | #include <linux/export.h> |
3241b1d3 JT |
12 | |
13 | /* | |
14 | * Removing an entry from a btree | |
15 | * ============================== | |
16 | * | |
17 | * A very important constraint for our btree is that no node, except the | |
18 | * root, may have fewer than a certain number of entries. | |
19 | * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). | |
20 | * | |
21 | * Ensuring this is complicated by the way we want to only ever hold the | |
22 | * locks on 2 nodes concurrently, and only change nodes in a top to bottom | |
23 | * fashion. | |
24 | * | |
25 | * Each node may have a left or right sibling. When decending the spine, | |
26 | * if a node contains only MIN_ENTRIES then we try and increase this to at | |
27 | * least MIN_ENTRIES + 1. We do this in the following ways: | |
28 | * | |
29 | * [A] No siblings => this can only happen if the node is the root, in which | |
30 | * case we copy the childs contents over the root. | |
31 | * | |
32 | * [B] No left sibling | |
33 | * ==> rebalance(node, right sibling) | |
34 | * | |
35 | * [C] No right sibling | |
36 | * ==> rebalance(left sibling, node) | |
37 | * | |
38 | * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD | |
39 | * ==> delete node adding it's contents to left and right | |
40 | * | |
41 | * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD | |
42 | * ==> rebalance(left, node, right) | |
43 | * | |
44 | * After these operations it's possible that the our original node no | |
45 | * longer contains the desired sub tree. For this reason this rebalancing | |
46 | * is performed on the children of the current node. This also avoids | |
47 | * having a special case for the root. | |
48 | * | |
49 | * Once this rebalancing has occurred we can then step into the child node | |
50 | * for internal nodes. Or delete the entry for leaf nodes. | |
51 | */ | |
52 | ||
53 | /* | |
54 | * Some little utilities for moving node data around. | |
55 | */ | |
550929fa | 56 | static void node_shift(struct btree_node *n, int shift) |
3241b1d3 JT |
57 | { |
58 | uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); | |
59 | uint32_t value_size = le32_to_cpu(n->header.value_size); | |
60 | ||
61 | if (shift < 0) { | |
62 | shift = -shift; | |
63 | BUG_ON(shift > nr_entries); | |
a3aefb39 | 64 | BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); |
3241b1d3 JT |
65 | memmove(key_ptr(n, 0), |
66 | key_ptr(n, shift), | |
67 | (nr_entries - shift) * sizeof(__le64)); | |
a3aefb39 JT |
68 | memmove(value_ptr(n, 0), |
69 | value_ptr(n, shift), | |
3241b1d3 JT |
70 | (nr_entries - shift) * value_size); |
71 | } else { | |
72 | BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); | |
73 | memmove(key_ptr(n, shift), | |
74 | key_ptr(n, 0), | |
75 | nr_entries * sizeof(__le64)); | |
a3aefb39 JT |
76 | memmove(value_ptr(n, shift), |
77 | value_ptr(n, 0), | |
3241b1d3 JT |
78 | nr_entries * value_size); |
79 | } | |
80 | } | |
81 | ||
550929fa | 82 | static void node_copy(struct btree_node *left, struct btree_node *right, int shift) |
3241b1d3 JT |
83 | { |
84 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | |
85 | uint32_t value_size = le32_to_cpu(left->header.value_size); | |
86 | BUG_ON(value_size != le32_to_cpu(right->header.value_size)); | |
87 | ||
88 | if (shift < 0) { | |
89 | shift = -shift; | |
90 | BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries)); | |
91 | memcpy(key_ptr(left, nr_left), | |
92 | key_ptr(right, 0), | |
93 | shift * sizeof(__le64)); | |
a3aefb39 JT |
94 | memcpy(value_ptr(left, nr_left), |
95 | value_ptr(right, 0), | |
3241b1d3 JT |
96 | shift * value_size); |
97 | } else { | |
98 | BUG_ON(shift > le32_to_cpu(right->header.max_entries)); | |
99 | memcpy(key_ptr(right, 0), | |
100 | key_ptr(left, nr_left - shift), | |
101 | shift * sizeof(__le64)); | |
a3aefb39 JT |
102 | memcpy(value_ptr(right, 0), |
103 | value_ptr(left, nr_left - shift), | |
3241b1d3 JT |
104 | shift * value_size); |
105 | } | |
106 | } | |
107 | ||
108 | /* | |
109 | * Delete a specific entry from a leaf node. | |
110 | */ | |
550929fa | 111 | static void delete_at(struct btree_node *n, unsigned index) |
3241b1d3 JT |
112 | { |
113 | unsigned nr_entries = le32_to_cpu(n->header.nr_entries); | |
114 | unsigned nr_to_copy = nr_entries - (index + 1); | |
115 | uint32_t value_size = le32_to_cpu(n->header.value_size); | |
116 | BUG_ON(index >= nr_entries); | |
117 | ||
118 | if (nr_to_copy) { | |
119 | memmove(key_ptr(n, index), | |
120 | key_ptr(n, index + 1), | |
121 | nr_to_copy * sizeof(__le64)); | |
122 | ||
a3aefb39 JT |
123 | memmove(value_ptr(n, index), |
124 | value_ptr(n, index + 1), | |
3241b1d3 JT |
125 | nr_to_copy * value_size); |
126 | } | |
127 | ||
128 | n->header.nr_entries = cpu_to_le32(nr_entries - 1); | |
129 | } | |
130 | ||
550929fa | 131 | static unsigned merge_threshold(struct btree_node *n) |
3241b1d3 | 132 | { |
b0988900 | 133 | return le32_to_cpu(n->header.max_entries) / 3; |
3241b1d3 JT |
134 | } |
135 | ||
136 | struct child { | |
137 | unsigned index; | |
138 | struct dm_block *block; | |
550929fa | 139 | struct btree_node *n; |
3241b1d3 JT |
140 | }; |
141 | ||
f046f89a JT |
142 | static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, |
143 | struct btree_node *parent, | |
3241b1d3 JT |
144 | unsigned index, struct child *result) |
145 | { | |
146 | int r, inc; | |
147 | dm_block_t root; | |
148 | ||
149 | result->index = index; | |
150 | root = value64(parent, index); | |
151 | ||
152 | r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, | |
153 | &result->block, &inc); | |
154 | if (r) | |
155 | return r; | |
156 | ||
157 | result->n = dm_block_data(result->block); | |
158 | ||
159 | if (inc) | |
f046f89a | 160 | inc_children(info->tm, result->n, vt); |
3241b1d3 | 161 | |
a3aefb39 | 162 | *((__le64 *) value_ptr(parent, index)) = |
3241b1d3 JT |
163 | cpu_to_le64(dm_block_location(result->block)); |
164 | ||
165 | return 0; | |
166 | } | |
167 | ||
4c7da06f | 168 | static void exit_child(struct dm_btree_info *info, struct child *c) |
3241b1d3 | 169 | { |
4c7da06f | 170 | dm_tm_unlock(info->tm, c->block); |
3241b1d3 JT |
171 | } |
172 | ||
550929fa | 173 | static void shift(struct btree_node *left, struct btree_node *right, int count) |
3241b1d3 | 174 | { |
b0988900 JT |
175 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
176 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | |
177 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | |
178 | uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); | |
179 | ||
180 | BUG_ON(max_entries != r_max_entries); | |
181 | BUG_ON(nr_left - count > max_entries); | |
182 | BUG_ON(nr_right + count > max_entries); | |
183 | ||
3241b1d3 JT |
184 | if (!count) |
185 | return; | |
186 | ||
187 | if (count > 0) { | |
188 | node_shift(right, count); | |
189 | node_copy(left, right, count); | |
190 | } else { | |
191 | node_copy(left, right, count); | |
192 | node_shift(right, count); | |
193 | } | |
194 | ||
b0988900 JT |
195 | left->header.nr_entries = cpu_to_le32(nr_left - count); |
196 | right->header.nr_entries = cpu_to_le32(nr_right + count); | |
3241b1d3 JT |
197 | } |
198 | ||
550929fa | 199 | static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, |
3241b1d3 JT |
200 | struct child *l, struct child *r) |
201 | { | |
550929fa MP |
202 | struct btree_node *left = l->n; |
203 | struct btree_node *right = r->n; | |
3241b1d3 JT |
204 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
205 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | |
474e5595 HT |
206 | /* |
207 | * Ensure the number of entries in each child will be greater | |
208 | * than or equal to (max_entries / 3 + 1), so no matter which | |
209 | * child is used for removal, the number will still be not | |
210 | * less than (max_entries / 3). | |
211 | */ | |
212 | unsigned int threshold = 2 * (merge_threshold(left) + 1); | |
3241b1d3 | 213 | |
b0988900 | 214 | if (nr_left + nr_right < threshold) { |
3241b1d3 JT |
215 | /* |
216 | * Merge | |
217 | */ | |
218 | node_copy(left, right, -nr_right); | |
219 | left->header.nr_entries = cpu_to_le32(nr_left + nr_right); | |
220 | delete_at(parent, r->index); | |
221 | ||
222 | /* | |
223 | * We need to decrement the right block, but not it's | |
224 | * children, since they're still referenced by left. | |
225 | */ | |
226 | dm_tm_dec(info->tm, dm_block_location(r->block)); | |
227 | } else { | |
228 | /* | |
229 | * Rebalance. | |
230 | */ | |
231 | unsigned target_left = (nr_left + nr_right) / 2; | |
3241b1d3 JT |
232 | shift(left, right, nr_left - target_left); |
233 | *key_ptr(parent, r->index) = right->keys[0]; | |
234 | } | |
235 | } | |
236 | ||
237 | static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, | |
f046f89a | 238 | struct dm_btree_value_type *vt, unsigned left_index) |
3241b1d3 JT |
239 | { |
240 | int r; | |
550929fa | 241 | struct btree_node *parent; |
3241b1d3 JT |
242 | struct child left, right; |
243 | ||
244 | parent = dm_block_data(shadow_current(s)); | |
245 | ||
f046f89a | 246 | r = init_child(info, vt, parent, left_index, &left); |
3241b1d3 JT |
247 | if (r) |
248 | return r; | |
249 | ||
f046f89a | 250 | r = init_child(info, vt, parent, left_index + 1, &right); |
3241b1d3 JT |
251 | if (r) { |
252 | exit_child(info, &left); | |
253 | return r; | |
254 | } | |
255 | ||
256 | __rebalance2(info, parent, &left, &right); | |
257 | ||
4c7da06f MP |
258 | exit_child(info, &left); |
259 | exit_child(info, &right); | |
3241b1d3 | 260 | |
4c7da06f | 261 | return 0; |
3241b1d3 JT |
262 | } |
263 | ||
b0988900 JT |
264 | /* |
265 | * We dump as many entries from center as possible into left, then the rest | |
266 | * in right, then rebalance2. This wastes some cpu, but I want something | |
267 | * simple atm. | |
268 | */ | |
550929fa | 269 | static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent, |
b0988900 | 270 | struct child *l, struct child *c, struct child *r, |
550929fa | 271 | struct btree_node *left, struct btree_node *center, struct btree_node *right, |
b0988900 JT |
272 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) |
273 | { | |
274 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | |
275 | unsigned shift = min(max_entries - nr_left, nr_center); | |
276 | ||
277 | BUG_ON(nr_left + shift > max_entries); | |
278 | node_copy(left, center, -shift); | |
279 | left->header.nr_entries = cpu_to_le32(nr_left + shift); | |
280 | ||
281 | if (shift != nr_center) { | |
282 | shift = nr_center - shift; | |
283 | BUG_ON((nr_right + shift) > max_entries); | |
284 | node_shift(right, shift); | |
285 | node_copy(center, right, shift); | |
286 | right->header.nr_entries = cpu_to_le32(nr_right + shift); | |
287 | } | |
288 | *key_ptr(parent, r->index) = right->keys[0]; | |
289 | ||
290 | delete_at(parent, c->index); | |
291 | r->index--; | |
292 | ||
293 | dm_tm_dec(info->tm, dm_block_location(c->block)); | |
294 | __rebalance2(info, parent, l, r); | |
295 | } | |
296 | ||
297 | /* | |
298 | * Redistributes entries among 3 sibling nodes. | |
299 | */ | |
550929fa | 300 | static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, |
b0988900 | 301 | struct child *l, struct child *c, struct child *r, |
550929fa | 302 | struct btree_node *left, struct btree_node *center, struct btree_node *right, |
b0988900 JT |
303 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) |
304 | { | |
305 | int s; | |
306 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | |
2871c69e JT |
307 | unsigned total = nr_left + nr_center + nr_right; |
308 | unsigned target_right = total / 3; | |
309 | unsigned remainder = (target_right * 3) != total; | |
310 | unsigned target_left = target_right + remainder; | |
311 | ||
312 | BUG_ON(target_left > max_entries); | |
313 | BUG_ON(target_right > max_entries); | |
b0988900 JT |
314 | |
315 | if (nr_left < nr_right) { | |
2871c69e | 316 | s = nr_left - target_left; |
b0988900 JT |
317 | |
318 | if (s < 0 && nr_center < -s) { | |
319 | /* not enough in central node */ | |
4c7e3093 DY |
320 | shift(left, center, -nr_center); |
321 | s += nr_center; | |
b0988900 JT |
322 | shift(left, right, s); |
323 | nr_right += s; | |
324 | } else | |
325 | shift(left, center, s); | |
326 | ||
2871c69e | 327 | shift(center, right, target_right - nr_right); |
b0988900 JT |
328 | |
329 | } else { | |
2871c69e | 330 | s = target_right - nr_right; |
b0988900 JT |
331 | if (s > 0 && nr_center < s) { |
332 | /* not enough in central node */ | |
333 | shift(center, right, nr_center); | |
4c7e3093 | 334 | s -= nr_center; |
b0988900 JT |
335 | shift(left, right, s); |
336 | nr_left -= s; | |
337 | } else | |
338 | shift(center, right, s); | |
339 | ||
2871c69e | 340 | shift(left, center, nr_left - target_left); |
b0988900 JT |
341 | } |
342 | ||
343 | *key_ptr(parent, c->index) = center->keys[0]; | |
344 | *key_ptr(parent, r->index) = right->keys[0]; | |
345 | } | |
346 | ||
550929fa | 347 | static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, |
3241b1d3 JT |
348 | struct child *l, struct child *c, struct child *r) |
349 | { | |
550929fa MP |
350 | struct btree_node *left = l->n; |
351 | struct btree_node *center = c->n; | |
352 | struct btree_node *right = r->n; | |
3241b1d3 JT |
353 | |
354 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | |
355 | uint32_t nr_center = le32_to_cpu(center->header.nr_entries); | |
356 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | |
3241b1d3 | 357 | |
b0988900 | 358 | unsigned threshold = merge_threshold(left) * 4 + 1; |
3241b1d3 JT |
359 | |
360 | BUG_ON(left->header.max_entries != center->header.max_entries); | |
361 | BUG_ON(center->header.max_entries != right->header.max_entries); | |
362 | ||
b0988900 JT |
363 | if ((nr_left + nr_center + nr_right) < threshold) |
364 | delete_center_node(info, parent, l, c, r, left, center, right, | |
365 | nr_left, nr_center, nr_right); | |
366 | else | |
367 | redistribute3(info, parent, l, c, r, left, center, right, | |
368 | nr_left, nr_center, nr_right); | |
3241b1d3 JT |
369 | } |
370 | ||
371 | static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, | |
f046f89a | 372 | struct dm_btree_value_type *vt, unsigned left_index) |
3241b1d3 JT |
373 | { |
374 | int r; | |
550929fa | 375 | struct btree_node *parent = dm_block_data(shadow_current(s)); |
3241b1d3 JT |
376 | struct child left, center, right; |
377 | ||
378 | /* | |
379 | * FIXME: fill out an array? | |
380 | */ | |
f046f89a | 381 | r = init_child(info, vt, parent, left_index, &left); |
3241b1d3 JT |
382 | if (r) |
383 | return r; | |
384 | ||
f046f89a | 385 | r = init_child(info, vt, parent, left_index + 1, ¢er); |
3241b1d3 JT |
386 | if (r) { |
387 | exit_child(info, &left); | |
388 | return r; | |
389 | } | |
390 | ||
f046f89a | 391 | r = init_child(info, vt, parent, left_index + 2, &right); |
3241b1d3 JT |
392 | if (r) { |
393 | exit_child(info, &left); | |
394 | exit_child(info, ¢er); | |
395 | return r; | |
396 | } | |
397 | ||
398 | __rebalance3(info, parent, &left, ¢er, &right); | |
399 | ||
4c7da06f MP |
400 | exit_child(info, &left); |
401 | exit_child(info, ¢er); | |
402 | exit_child(info, &right); | |
3241b1d3 JT |
403 | |
404 | return 0; | |
405 | } | |
406 | ||
3241b1d3 | 407 | static int rebalance_children(struct shadow_spine *s, |
f046f89a JT |
408 | struct dm_btree_info *info, |
409 | struct dm_btree_value_type *vt, uint64_t key) | |
3241b1d3 JT |
410 | { |
411 | int i, r, has_left_sibling, has_right_sibling; | |
550929fa | 412 | struct btree_node *n; |
3241b1d3 JT |
413 | |
414 | n = dm_block_data(shadow_current(s)); | |
415 | ||
416 | if (le32_to_cpu(n->header.nr_entries) == 1) { | |
417 | struct dm_block *child; | |
418 | dm_block_t b = value64(n, 0); | |
419 | ||
420 | r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); | |
421 | if (r) | |
422 | return r; | |
423 | ||
424 | memcpy(n, dm_block_data(child), | |
425 | dm_bm_block_size(dm_tm_get_bm(info->tm))); | |
3241b1d3 JT |
426 | |
427 | dm_tm_dec(info->tm, dm_block_location(child)); | |
1b8d2789 | 428 | dm_tm_unlock(info->tm, child); |
3241b1d3 JT |
429 | return 0; |
430 | } | |
431 | ||
432 | i = lower_bound(n, key); | |
433 | if (i < 0) | |
434 | return -ENODATA; | |
435 | ||
3241b1d3 JT |
436 | has_left_sibling = i > 0; |
437 | has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); | |
438 | ||
439 | if (!has_left_sibling) | |
f046f89a | 440 | r = rebalance2(s, info, vt, i); |
3241b1d3 JT |
441 | |
442 | else if (!has_right_sibling) | |
f046f89a | 443 | r = rebalance2(s, info, vt, i - 1); |
3241b1d3 JT |
444 | |
445 | else | |
f046f89a | 446 | r = rebalance3(s, info, vt, i - 1); |
3241b1d3 JT |
447 | |
448 | return r; | |
449 | } | |
450 | ||
550929fa | 451 | static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index) |
3241b1d3 JT |
452 | { |
453 | int i = lower_bound(n, key); | |
454 | ||
455 | if ((i < 0) || | |
456 | (i >= le32_to_cpu(n->header.nr_entries)) || | |
457 | (le64_to_cpu(n->keys[i]) != key)) | |
458 | return -ENODATA; | |
459 | ||
460 | *index = i; | |
461 | ||
462 | return 0; | |
463 | } | |
464 | ||
465 | /* | |
466 | * Prepares for removal from one level of the hierarchy. The caller must | |
467 | * call delete_at() to remove the entry at index. | |
468 | */ | |
469 | static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, | |
470 | struct dm_btree_value_type *vt, dm_block_t root, | |
471 | uint64_t key, unsigned *index) | |
472 | { | |
473 | int i = *index, r; | |
550929fa | 474 | struct btree_node *n; |
3241b1d3 JT |
475 | |
476 | for (;;) { | |
477 | r = shadow_step(s, root, vt); | |
478 | if (r < 0) | |
479 | break; | |
480 | ||
481 | /* | |
482 | * We have to patch up the parent node, ugly, but I don't | |
483 | * see a way to do this automatically as part of the spine | |
484 | * op. | |
485 | */ | |
486 | if (shadow_has_parent(s)) { | |
487 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | |
a3aefb39 | 488 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), |
3241b1d3 JT |
489 | &location, sizeof(__le64)); |
490 | } | |
491 | ||
492 | n = dm_block_data(shadow_current(s)); | |
493 | ||
494 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) | |
495 | return do_leaf(n, key, index); | |
496 | ||
f046f89a | 497 | r = rebalance_children(s, info, vt, key); |
3241b1d3 JT |
498 | if (r) |
499 | break; | |
500 | ||
501 | n = dm_block_data(shadow_current(s)); | |
502 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) | |
503 | return do_leaf(n, key, index); | |
504 | ||
505 | i = lower_bound(n, key); | |
506 | ||
507 | /* | |
508 | * We know the key is present, or else | |
509 | * rebalance_children would have returned | |
510 | * -ENODATA | |
511 | */ | |
512 | root = value64(n, i); | |
513 | } | |
514 | ||
515 | return r; | |
516 | } | |
517 | ||
518 | int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, | |
519 | uint64_t *keys, dm_block_t *new_root) | |
520 | { | |
521 | unsigned level, last_level = info->levels - 1; | |
522 | int index = 0, r = 0; | |
523 | struct shadow_spine spine; | |
550929fa | 524 | struct btree_node *n; |
b0dc3c8b | 525 | struct dm_btree_value_type le64_vt; |
3241b1d3 | 526 | |
b0dc3c8b | 527 | init_le64_type(info->tm, &le64_vt); |
3241b1d3 JT |
528 | init_shadow_spine(&spine, info); |
529 | for (level = 0; level < info->levels; level++) { | |
530 | r = remove_raw(&spine, info, | |
531 | (level == last_level ? | |
b0dc3c8b | 532 | &info->value_type : &le64_vt), |
3241b1d3 JT |
533 | root, keys[level], (unsigned *)&index); |
534 | if (r < 0) | |
535 | break; | |
536 | ||
537 | n = dm_block_data(shadow_current(&spine)); | |
538 | if (level != last_level) { | |
539 | root = value64(n, index); | |
540 | continue; | |
541 | } | |
542 | ||
543 | BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); | |
544 | ||
545 | if (info->value_type.dec) | |
546 | info->value_type.dec(info->value_type.context, | |
be500ed7 | 547 | value_ptr(n, index), 1); |
3241b1d3 JT |
548 | |
549 | delete_at(n, index); | |
550 | } | |
551 | ||
b6e58b54 HT |
552 | if (!r) |
553 | *new_root = shadow_root(&spine); | |
3241b1d3 JT |
554 | exit_shadow_spine(&spine); |
555 | ||
556 | return r; | |
557 | } | |
558 | EXPORT_SYMBOL_GPL(dm_btree_remove); | |
4ec331c3 JT |
559 | |
560 | /*----------------------------------------------------------------*/ | |
561 | ||
562 | static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info, | |
563 | struct dm_btree_value_type *vt, dm_block_t root, | |
564 | uint64_t key, int *index) | |
565 | { | |
566 | int i = *index, r; | |
567 | struct btree_node *n; | |
568 | ||
569 | for (;;) { | |
570 | r = shadow_step(s, root, vt); | |
571 | if (r < 0) | |
572 | break; | |
573 | ||
574 | /* | |
575 | * We have to patch up the parent node, ugly, but I don't | |
576 | * see a way to do this automatically as part of the spine | |
577 | * op. | |
578 | */ | |
579 | if (shadow_has_parent(s)) { | |
580 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | |
581 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), | |
582 | &location, sizeof(__le64)); | |
583 | } | |
584 | ||
585 | n = dm_block_data(shadow_current(s)); | |
586 | ||
587 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) { | |
588 | *index = lower_bound(n, key); | |
589 | return 0; | |
590 | } | |
591 | ||
592 | r = rebalance_children(s, info, vt, key); | |
593 | if (r) | |
594 | break; | |
595 | ||
596 | n = dm_block_data(shadow_current(s)); | |
597 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) { | |
598 | *index = lower_bound(n, key); | |
599 | return 0; | |
600 | } | |
601 | ||
602 | i = lower_bound(n, key); | |
603 | ||
604 | /* | |
605 | * We know the key is present, or else | |
606 | * rebalance_children would have returned | |
607 | * -ENODATA | |
608 | */ | |
609 | root = value64(n, i); | |
610 | } | |
611 | ||
612 | return r; | |
613 | } | |
614 | ||
615 | static int remove_one(struct dm_btree_info *info, dm_block_t root, | |
616 | uint64_t *keys, uint64_t end_key, | |
617 | dm_block_t *new_root, unsigned *nr_removed) | |
618 | { | |
619 | unsigned level, last_level = info->levels - 1; | |
620 | int index = 0, r = 0; | |
621 | struct shadow_spine spine; | |
622 | struct btree_node *n; | |
b0dc3c8b | 623 | struct dm_btree_value_type le64_vt; |
4ec331c3 JT |
624 | uint64_t k; |
625 | ||
b0dc3c8b | 626 | init_le64_type(info->tm, &le64_vt); |
4ec331c3 JT |
627 | init_shadow_spine(&spine, info); |
628 | for (level = 0; level < last_level; level++) { | |
b0dc3c8b | 629 | r = remove_raw(&spine, info, &le64_vt, |
4ec331c3 JT |
630 | root, keys[level], (unsigned *) &index); |
631 | if (r < 0) | |
632 | goto out; | |
633 | ||
634 | n = dm_block_data(shadow_current(&spine)); | |
635 | root = value64(n, index); | |
636 | } | |
637 | ||
638 | r = remove_nearest(&spine, info, &info->value_type, | |
639 | root, keys[last_level], &index); | |
640 | if (r < 0) | |
641 | goto out; | |
642 | ||
643 | n = dm_block_data(shadow_current(&spine)); | |
644 | ||
645 | if (index < 0) | |
646 | index = 0; | |
647 | ||
648 | if (index >= le32_to_cpu(n->header.nr_entries)) { | |
649 | r = -ENODATA; | |
650 | goto out; | |
651 | } | |
652 | ||
653 | k = le64_to_cpu(n->keys[index]); | |
654 | if (k >= keys[last_level] && k < end_key) { | |
655 | if (info->value_type.dec) | |
656 | info->value_type.dec(info->value_type.context, | |
be500ed7 | 657 | value_ptr(n, index), 1); |
4ec331c3 JT |
658 | |
659 | delete_at(n, index); | |
aa0cd28d | 660 | keys[last_level] = k + 1ull; |
4ec331c3 JT |
661 | |
662 | } else | |
663 | r = -ENODATA; | |
664 | ||
665 | out: | |
666 | *new_root = shadow_root(&spine); | |
667 | exit_shadow_spine(&spine); | |
668 | ||
669 | return r; | |
670 | } | |
671 | ||
672 | int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, | |
673 | uint64_t *first_key, uint64_t end_key, | |
674 | dm_block_t *new_root, unsigned *nr_removed) | |
675 | { | |
676 | int r; | |
677 | ||
678 | *nr_removed = 0; | |
679 | do { | |
680 | r = remove_one(info, root, first_key, end_key, &root, nr_removed); | |
681 | if (!r) | |
682 | (*nr_removed)++; | |
683 | } while (!r); | |
684 | ||
685 | *new_root = root; | |
686 | return r == -ENODATA ? 0 : r; | |
687 | } | |
688 | EXPORT_SYMBOL_GPL(dm_btree_remove_leaves); |