Commit | Line | Data |
---|---|---|
3bd94003 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
3241b1d3 JT |
2 | /* |
3 | * Copyright (C) 2011 Red Hat, Inc. | |
4 | * | |
5 | * This file is released under the GPL. | |
6 | */ | |
7 | ||
8 | #include "dm-btree.h" | |
9 | #include "dm-btree-internal.h" | |
10 | #include "dm-transaction-manager.h" | |
11 | ||
1944ce60 | 12 | #include <linux/export.h> |
c671ffa5 JT |
13 | #include <linux/device-mapper.h> |
14 | ||
15 | #define DM_MSG_PREFIX "btree" | |
3241b1d3 JT |
16 | |
17 | /* | |
18 | * Removing an entry from a btree | |
19 | * ============================== | |
20 | * | |
21 | * A very important constraint for our btree is that no node, except the | |
22 | * root, may have fewer than a certain number of entries. | |
23 | * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). | |
24 | * | |
25 | * Ensuring this is complicated by the way we want to only ever hold the | |
26 | * locks on 2 nodes concurrently, and only change nodes in a top to bottom | |
27 | * fashion. | |
28 | * | |
29 | * Each node may have a left or right sibling. When decending the spine, | |
30 | * if a node contains only MIN_ENTRIES then we try and increase this to at | |
31 | * least MIN_ENTRIES + 1. We do this in the following ways: | |
32 | * | |
33 | * [A] No siblings => this can only happen if the node is the root, in which | |
34 | * case we copy the childs contents over the root. | |
35 | * | |
36 | * [B] No left sibling | |
37 | * ==> rebalance(node, right sibling) | |
38 | * | |
39 | * [C] No right sibling | |
40 | * ==> rebalance(left sibling, node) | |
41 | * | |
42 | * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD | |
43 | * ==> delete node adding it's contents to left and right | |
44 | * | |
45 | * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD | |
46 | * ==> rebalance(left, node, right) | |
47 | * | |
48 | * After these operations it's possible that the our original node no | |
49 | * longer contains the desired sub tree. For this reason this rebalancing | |
50 | * is performed on the children of the current node. This also avoids | |
51 | * having a special case for the root. | |
52 | * | |
53 | * Once this rebalancing has occurred we can then step into the child node | |
54 | * for internal nodes. Or delete the entry for leaf nodes. | |
55 | */ | |
56 | ||
57 | /* | |
58 | * Some little utilities for moving node data around. | |
59 | */ | |
550929fa | 60 | static void node_shift(struct btree_node *n, int shift) |
3241b1d3 JT |
61 | { |
62 | uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); | |
63 | uint32_t value_size = le32_to_cpu(n->header.value_size); | |
64 | ||
65 | if (shift < 0) { | |
66 | shift = -shift; | |
67 | BUG_ON(shift > nr_entries); | |
a3aefb39 | 68 | BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); |
3241b1d3 JT |
69 | memmove(key_ptr(n, 0), |
70 | key_ptr(n, shift), | |
71 | (nr_entries - shift) * sizeof(__le64)); | |
a3aefb39 JT |
72 | memmove(value_ptr(n, 0), |
73 | value_ptr(n, shift), | |
3241b1d3 JT |
74 | (nr_entries - shift) * value_size); |
75 | } else { | |
76 | BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); | |
77 | memmove(key_ptr(n, shift), | |
78 | key_ptr(n, 0), | |
79 | nr_entries * sizeof(__le64)); | |
a3aefb39 JT |
80 | memmove(value_ptr(n, shift), |
81 | value_ptr(n, 0), | |
3241b1d3 JT |
82 | nr_entries * value_size); |
83 | } | |
84 | } | |
85 | ||
c671ffa5 | 86 | static int node_copy(struct btree_node *left, struct btree_node *right, int shift) |
3241b1d3 JT |
87 | { |
88 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | |
89 | uint32_t value_size = le32_to_cpu(left->header.value_size); | |
0ef0b471 | 90 | |
c671ffa5 JT |
91 | if (value_size != le32_to_cpu(right->header.value_size)) { |
92 | DMERR("mismatched value size"); | |
93 | return -EILSEQ; | |
94 | } | |
3241b1d3 JT |
95 | |
96 | if (shift < 0) { | |
97 | shift = -shift; | |
c671ffa5 JT |
98 | |
99 | if (nr_left + shift > le32_to_cpu(left->header.max_entries)) { | |
100 | DMERR("bad shift"); | |
101 | return -EINVAL; | |
102 | } | |
103 | ||
3241b1d3 JT |
104 | memcpy(key_ptr(left, nr_left), |
105 | key_ptr(right, 0), | |
106 | shift * sizeof(__le64)); | |
a3aefb39 JT |
107 | memcpy(value_ptr(left, nr_left), |
108 | value_ptr(right, 0), | |
3241b1d3 JT |
109 | shift * value_size); |
110 | } else { | |
c671ffa5 JT |
111 | if (shift > le32_to_cpu(right->header.max_entries)) { |
112 | DMERR("bad shift"); | |
113 | return -EINVAL; | |
114 | } | |
115 | ||
3241b1d3 JT |
116 | memcpy(key_ptr(right, 0), |
117 | key_ptr(left, nr_left - shift), | |
118 | shift * sizeof(__le64)); | |
a3aefb39 JT |
119 | memcpy(value_ptr(right, 0), |
120 | value_ptr(left, nr_left - shift), | |
3241b1d3 JT |
121 | shift * value_size); |
122 | } | |
c671ffa5 | 123 | return 0; |
3241b1d3 JT |
124 | } |
125 | ||
126 | /* | |
127 | * Delete a specific entry from a leaf node. | |
128 | */ | |
86a3238c | 129 | static void delete_at(struct btree_node *n, unsigned int index) |
3241b1d3 | 130 | { |
86a3238c HM |
131 | unsigned int nr_entries = le32_to_cpu(n->header.nr_entries); |
132 | unsigned int nr_to_copy = nr_entries - (index + 1); | |
3241b1d3 | 133 | uint32_t value_size = le32_to_cpu(n->header.value_size); |
0ef0b471 | 134 | |
3241b1d3 JT |
135 | BUG_ON(index >= nr_entries); |
136 | ||
137 | if (nr_to_copy) { | |
138 | memmove(key_ptr(n, index), | |
139 | key_ptr(n, index + 1), | |
140 | nr_to_copy * sizeof(__le64)); | |
141 | ||
a3aefb39 JT |
142 | memmove(value_ptr(n, index), |
143 | value_ptr(n, index + 1), | |
3241b1d3 JT |
144 | nr_to_copy * value_size); |
145 | } | |
146 | ||
147 | n->header.nr_entries = cpu_to_le32(nr_entries - 1); | |
148 | } | |
149 | ||
86a3238c | 150 | static unsigned int merge_threshold(struct btree_node *n) |
3241b1d3 | 151 | { |
b0988900 | 152 | return le32_to_cpu(n->header.max_entries) / 3; |
3241b1d3 JT |
153 | } |
154 | ||
155 | struct child { | |
86a3238c | 156 | unsigned int index; |
3241b1d3 | 157 | struct dm_block *block; |
550929fa | 158 | struct btree_node *n; |
3241b1d3 JT |
159 | }; |
160 | ||
f046f89a JT |
161 | static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, |
162 | struct btree_node *parent, | |
86a3238c | 163 | unsigned int index, struct child *result) |
3241b1d3 JT |
164 | { |
165 | int r, inc; | |
166 | dm_block_t root; | |
167 | ||
168 | result->index = index; | |
169 | root = value64(parent, index); | |
170 | ||
171 | r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, | |
172 | &result->block, &inc); | |
173 | if (r) | |
174 | return r; | |
175 | ||
176 | result->n = dm_block_data(result->block); | |
177 | ||
178 | if (inc) | |
f046f89a | 179 | inc_children(info->tm, result->n, vt); |
3241b1d3 | 180 | |
a3aefb39 | 181 | *((__le64 *) value_ptr(parent, index)) = |
3241b1d3 JT |
182 | cpu_to_le64(dm_block_location(result->block)); |
183 | ||
184 | return 0; | |
185 | } | |
186 | ||
4c7da06f | 187 | static void exit_child(struct dm_btree_info *info, struct child *c) |
3241b1d3 | 188 | { |
4c7da06f | 189 | dm_tm_unlock(info->tm, c->block); |
3241b1d3 JT |
190 | } |
191 | ||
c671ffa5 | 192 | static int shift(struct btree_node *left, struct btree_node *right, int count) |
3241b1d3 | 193 | { |
c671ffa5 | 194 | int r; |
b0988900 JT |
195 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
196 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | |
197 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | |
198 | uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); | |
199 | ||
c671ffa5 JT |
200 | if (max_entries != r_max_entries) { |
201 | DMERR("node max_entries mismatch"); | |
202 | return -EILSEQ; | |
203 | } | |
204 | ||
205 | if (nr_left - count > max_entries) { | |
206 | DMERR("node shift out of bounds"); | |
207 | return -EINVAL; | |
208 | } | |
209 | ||
210 | if (nr_right + count > max_entries) { | |
211 | DMERR("node shift out of bounds"); | |
212 | return -EINVAL; | |
213 | } | |
b0988900 | 214 | |
3241b1d3 | 215 | if (!count) |
c671ffa5 | 216 | return 0; |
3241b1d3 JT |
217 | |
218 | if (count > 0) { | |
219 | node_shift(right, count); | |
c671ffa5 JT |
220 | r = node_copy(left, right, count); |
221 | if (r) | |
222 | return r; | |
3241b1d3 | 223 | } else { |
c671ffa5 JT |
224 | r = node_copy(left, right, count); |
225 | if (r) | |
226 | return r; | |
3241b1d3 JT |
227 | node_shift(right, count); |
228 | } | |
229 | ||
b0988900 JT |
230 | left->header.nr_entries = cpu_to_le32(nr_left - count); |
231 | right->header.nr_entries = cpu_to_le32(nr_right + count); | |
c671ffa5 JT |
232 | |
233 | return 0; | |
3241b1d3 JT |
234 | } |
235 | ||
c671ffa5 JT |
236 | static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent, |
237 | struct child *l, struct child *r) | |
3241b1d3 | 238 | { |
c671ffa5 | 239 | int ret; |
550929fa MP |
240 | struct btree_node *left = l->n; |
241 | struct btree_node *right = r->n; | |
3241b1d3 JT |
242 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
243 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | |
474e5595 HT |
244 | /* |
245 | * Ensure the number of entries in each child will be greater | |
246 | * than or equal to (max_entries / 3 + 1), so no matter which | |
247 | * child is used for removal, the number will still be not | |
248 | * less than (max_entries / 3). | |
249 | */ | |
250 | unsigned int threshold = 2 * (merge_threshold(left) + 1); | |
3241b1d3 | 251 | |
b0988900 | 252 | if (nr_left + nr_right < threshold) { |
3241b1d3 JT |
253 | /* |
254 | * Merge | |
255 | */ | |
256 | node_copy(left, right, -nr_right); | |
257 | left->header.nr_entries = cpu_to_le32(nr_left + nr_right); | |
258 | delete_at(parent, r->index); | |
259 | ||
260 | /* | |
261 | * We need to decrement the right block, but not it's | |
262 | * children, since they're still referenced by left. | |
263 | */ | |
264 | dm_tm_dec(info->tm, dm_block_location(r->block)); | |
265 | } else { | |
266 | /* | |
267 | * Rebalance. | |
268 | */ | |
86a3238c | 269 | unsigned int target_left = (nr_left + nr_right) / 2; |
0ef0b471 | 270 | |
c671ffa5 JT |
271 | ret = shift(left, right, nr_left - target_left); |
272 | if (ret) | |
273 | return ret; | |
3241b1d3 JT |
274 | *key_ptr(parent, r->index) = right->keys[0]; |
275 | } | |
c671ffa5 | 276 | return 0; |
3241b1d3 JT |
277 | } |
278 | ||
279 | static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, | |
86a3238c | 280 | struct dm_btree_value_type *vt, unsigned int left_index) |
3241b1d3 JT |
281 | { |
282 | int r; | |
550929fa | 283 | struct btree_node *parent; |
3241b1d3 JT |
284 | struct child left, right; |
285 | ||
286 | parent = dm_block_data(shadow_current(s)); | |
287 | ||
f046f89a | 288 | r = init_child(info, vt, parent, left_index, &left); |
3241b1d3 JT |
289 | if (r) |
290 | return r; | |
291 | ||
f046f89a | 292 | r = init_child(info, vt, parent, left_index + 1, &right); |
3241b1d3 JT |
293 | if (r) { |
294 | exit_child(info, &left); | |
295 | return r; | |
296 | } | |
297 | ||
c671ffa5 | 298 | r = __rebalance2(info, parent, &left, &right); |
3241b1d3 | 299 | |
4c7da06f MP |
300 | exit_child(info, &left); |
301 | exit_child(info, &right); | |
3241b1d3 | 302 | |
c671ffa5 | 303 | return r; |
3241b1d3 JT |
304 | } |
305 | ||
b0988900 JT |
306 | /* |
307 | * We dump as many entries from center as possible into left, then the rest | |
308 | * in right, then rebalance2. This wastes some cpu, but I want something | |
309 | * simple atm. | |
310 | */ | |
c671ffa5 JT |
311 | static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent, |
312 | struct child *l, struct child *c, struct child *r, | |
313 | struct btree_node *left, struct btree_node *center, struct btree_node *right, | |
314 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) | |
b0988900 JT |
315 | { |
316 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | |
86a3238c | 317 | unsigned int shift = min(max_entries - nr_left, nr_center); |
b0988900 | 318 | |
c671ffa5 JT |
319 | if (nr_left + shift > max_entries) { |
320 | DMERR("node shift out of bounds"); | |
321 | return -EINVAL; | |
322 | } | |
323 | ||
b0988900 JT |
324 | node_copy(left, center, -shift); |
325 | left->header.nr_entries = cpu_to_le32(nr_left + shift); | |
326 | ||
327 | if (shift != nr_center) { | |
328 | shift = nr_center - shift; | |
c671ffa5 JT |
329 | |
330 | if ((nr_right + shift) > max_entries) { | |
331 | DMERR("node shift out of bounds"); | |
332 | return -EINVAL; | |
333 | } | |
334 | ||
b0988900 JT |
335 | node_shift(right, shift); |
336 | node_copy(center, right, shift); | |
337 | right->header.nr_entries = cpu_to_le32(nr_right + shift); | |
338 | } | |
339 | *key_ptr(parent, r->index) = right->keys[0]; | |
340 | ||
341 | delete_at(parent, c->index); | |
342 | r->index--; | |
343 | ||
344 | dm_tm_dec(info->tm, dm_block_location(c->block)); | |
c671ffa5 | 345 | return __rebalance2(info, parent, l, r); |
b0988900 JT |
346 | } |
347 | ||
348 | /* | |
349 | * Redistributes entries among 3 sibling nodes. | |
350 | */ | |
c671ffa5 JT |
351 | static int redistribute3(struct dm_btree_info *info, struct btree_node *parent, |
352 | struct child *l, struct child *c, struct child *r, | |
353 | struct btree_node *left, struct btree_node *center, struct btree_node *right, | |
354 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) | |
b0988900 | 355 | { |
c671ffa5 | 356 | int s, ret; |
b0988900 | 357 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); |
86a3238c HM |
358 | unsigned int total = nr_left + nr_center + nr_right; |
359 | unsigned int target_right = total / 3; | |
360 | unsigned int remainder = (target_right * 3) != total; | |
361 | unsigned int target_left = target_right + remainder; | |
2871c69e JT |
362 | |
363 | BUG_ON(target_left > max_entries); | |
364 | BUG_ON(target_right > max_entries); | |
b0988900 JT |
365 | |
366 | if (nr_left < nr_right) { | |
2871c69e | 367 | s = nr_left - target_left; |
b0988900 JT |
368 | |
369 | if (s < 0 && nr_center < -s) { | |
370 | /* not enough in central node */ | |
c671ffa5 JT |
371 | ret = shift(left, center, -nr_center); |
372 | if (ret) | |
373 | return ret; | |
b0988900 | 374 | |
c671ffa5 JT |
375 | s += nr_center; |
376 | ret = shift(left, right, s); | |
377 | if (ret) | |
378 | return ret; | |
b0988900 | 379 | |
c671ffa5 JT |
380 | nr_right += s; |
381 | } else { | |
382 | ret = shift(left, center, s); | |
383 | if (ret) | |
384 | return ret; | |
385 | } | |
386 | ||
387 | ret = shift(center, right, target_right - nr_right); | |
388 | if (ret) | |
389 | return ret; | |
b0988900 | 390 | } else { |
2871c69e | 391 | s = target_right - nr_right; |
b0988900 JT |
392 | if (s > 0 && nr_center < s) { |
393 | /* not enough in central node */ | |
c671ffa5 JT |
394 | ret = shift(center, right, nr_center); |
395 | if (ret) | |
396 | return ret; | |
4c7e3093 | 397 | s -= nr_center; |
c671ffa5 JT |
398 | ret = shift(left, right, s); |
399 | if (ret) | |
400 | return ret; | |
b0988900 | 401 | nr_left -= s; |
c671ffa5 JT |
402 | } else { |
403 | ret = shift(center, right, s); | |
404 | if (ret) | |
405 | return ret; | |
406 | } | |
407 | ||
408 | ret = shift(left, center, nr_left - target_left); | |
409 | if (ret) | |
410 | return ret; | |
b0988900 JT |
411 | } |
412 | ||
413 | *key_ptr(parent, c->index) = center->keys[0]; | |
414 | *key_ptr(parent, r->index) = right->keys[0]; | |
c671ffa5 | 415 | return 0; |
b0988900 JT |
416 | } |
417 | ||
c671ffa5 JT |
418 | static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent, |
419 | struct child *l, struct child *c, struct child *r) | |
3241b1d3 | 420 | { |
550929fa MP |
421 | struct btree_node *left = l->n; |
422 | struct btree_node *center = c->n; | |
423 | struct btree_node *right = r->n; | |
3241b1d3 JT |
424 | |
425 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | |
426 | uint32_t nr_center = le32_to_cpu(center->header.nr_entries); | |
427 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | |
3241b1d3 | 428 | |
86a3238c | 429 | unsigned int threshold = merge_threshold(left) * 4 + 1; |
3241b1d3 | 430 | |
c671ffa5 JT |
431 | if ((left->header.max_entries != center->header.max_entries) || |
432 | (center->header.max_entries != right->header.max_entries)) { | |
433 | DMERR("bad btree metadata, max_entries differ"); | |
434 | return -EILSEQ; | |
435 | } | |
3241b1d3 | 436 | |
c671ffa5 JT |
437 | if ((nr_left + nr_center + nr_right) < threshold) { |
438 | return delete_center_node(info, parent, l, c, r, left, center, right, | |
439 | nr_left, nr_center, nr_right); | |
440 | } | |
441 | ||
442 | return redistribute3(info, parent, l, c, r, left, center, right, | |
443 | nr_left, nr_center, nr_right); | |
3241b1d3 JT |
444 | } |
445 | ||
446 | static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, | |
86a3238c | 447 | struct dm_btree_value_type *vt, unsigned int left_index) |
3241b1d3 JT |
448 | { |
449 | int r; | |
550929fa | 450 | struct btree_node *parent = dm_block_data(shadow_current(s)); |
3241b1d3 JT |
451 | struct child left, center, right; |
452 | ||
453 | /* | |
454 | * FIXME: fill out an array? | |
455 | */ | |
f046f89a | 456 | r = init_child(info, vt, parent, left_index, &left); |
3241b1d3 JT |
457 | if (r) |
458 | return r; | |
459 | ||
f046f89a | 460 | r = init_child(info, vt, parent, left_index + 1, ¢er); |
3241b1d3 JT |
461 | if (r) { |
462 | exit_child(info, &left); | |
463 | return r; | |
464 | } | |
465 | ||
f046f89a | 466 | r = init_child(info, vt, parent, left_index + 2, &right); |
3241b1d3 JT |
467 | if (r) { |
468 | exit_child(info, &left); | |
469 | exit_child(info, ¢er); | |
470 | return r; | |
471 | } | |
472 | ||
c671ffa5 | 473 | r = __rebalance3(info, parent, &left, ¢er, &right); |
3241b1d3 | 474 | |
4c7da06f MP |
475 | exit_child(info, &left); |
476 | exit_child(info, ¢er); | |
477 | exit_child(info, &right); | |
3241b1d3 | 478 | |
c671ffa5 | 479 | return r; |
3241b1d3 JT |
480 | } |
481 | ||
3241b1d3 | 482 | static int rebalance_children(struct shadow_spine *s, |
f046f89a JT |
483 | struct dm_btree_info *info, |
484 | struct dm_btree_value_type *vt, uint64_t key) | |
3241b1d3 JT |
485 | { |
486 | int i, r, has_left_sibling, has_right_sibling; | |
550929fa | 487 | struct btree_node *n; |
3241b1d3 JT |
488 | |
489 | n = dm_block_data(shadow_current(s)); | |
490 | ||
491 | if (le32_to_cpu(n->header.nr_entries) == 1) { | |
492 | struct dm_block *child; | |
493 | dm_block_t b = value64(n, 0); | |
494 | ||
495 | r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); | |
496 | if (r) | |
497 | return r; | |
498 | ||
499 | memcpy(n, dm_block_data(child), | |
500 | dm_bm_block_size(dm_tm_get_bm(info->tm))); | |
3241b1d3 JT |
501 | |
502 | dm_tm_dec(info->tm, dm_block_location(child)); | |
1b8d2789 | 503 | dm_tm_unlock(info->tm, child); |
3241b1d3 JT |
504 | return 0; |
505 | } | |
506 | ||
507 | i = lower_bound(n, key); | |
508 | if (i < 0) | |
509 | return -ENODATA; | |
510 | ||
3241b1d3 JT |
511 | has_left_sibling = i > 0; |
512 | has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); | |
513 | ||
514 | if (!has_left_sibling) | |
f046f89a | 515 | r = rebalance2(s, info, vt, i); |
3241b1d3 JT |
516 | |
517 | else if (!has_right_sibling) | |
f046f89a | 518 | r = rebalance2(s, info, vt, i - 1); |
3241b1d3 JT |
519 | |
520 | else | |
f046f89a | 521 | r = rebalance3(s, info, vt, i - 1); |
3241b1d3 JT |
522 | |
523 | return r; | |
524 | } | |
525 | ||
86a3238c | 526 | static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index) |
3241b1d3 JT |
527 | { |
528 | int i = lower_bound(n, key); | |
529 | ||
530 | if ((i < 0) || | |
531 | (i >= le32_to_cpu(n->header.nr_entries)) || | |
532 | (le64_to_cpu(n->keys[i]) != key)) | |
533 | return -ENODATA; | |
534 | ||
535 | *index = i; | |
536 | ||
537 | return 0; | |
538 | } | |
539 | ||
540 | /* | |
541 | * Prepares for removal from one level of the hierarchy. The caller must | |
542 | * call delete_at() to remove the entry at index. | |
543 | */ | |
544 | static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, | |
545 | struct dm_btree_value_type *vt, dm_block_t root, | |
86a3238c | 546 | uint64_t key, unsigned int *index) |
3241b1d3 JT |
547 | { |
548 | int i = *index, r; | |
550929fa | 549 | struct btree_node *n; |
3241b1d3 JT |
550 | |
551 | for (;;) { | |
552 | r = shadow_step(s, root, vt); | |
553 | if (r < 0) | |
554 | break; | |
555 | ||
556 | /* | |
557 | * We have to patch up the parent node, ugly, but I don't | |
558 | * see a way to do this automatically as part of the spine | |
559 | * op. | |
560 | */ | |
561 | if (shadow_has_parent(s)) { | |
562 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | |
0ef0b471 | 563 | |
a3aefb39 | 564 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), |
3241b1d3 JT |
565 | &location, sizeof(__le64)); |
566 | } | |
567 | ||
568 | n = dm_block_data(shadow_current(s)); | |
569 | ||
570 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) | |
571 | return do_leaf(n, key, index); | |
572 | ||
f046f89a | 573 | r = rebalance_children(s, info, vt, key); |
3241b1d3 JT |
574 | if (r) |
575 | break; | |
576 | ||
577 | n = dm_block_data(shadow_current(s)); | |
578 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) | |
579 | return do_leaf(n, key, index); | |
580 | ||
581 | i = lower_bound(n, key); | |
582 | ||
583 | /* | |
584 | * We know the key is present, or else | |
585 | * rebalance_children would have returned | |
586 | * -ENODATA | |
587 | */ | |
588 | root = value64(n, i); | |
589 | } | |
590 | ||
591 | return r; | |
592 | } | |
593 | ||
594 | int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, | |
595 | uint64_t *keys, dm_block_t *new_root) | |
596 | { | |
86a3238c | 597 | unsigned int level, last_level = info->levels - 1; |
3241b1d3 JT |
598 | int index = 0, r = 0; |
599 | struct shadow_spine spine; | |
550929fa | 600 | struct btree_node *n; |
b0dc3c8b | 601 | struct dm_btree_value_type le64_vt; |
3241b1d3 | 602 | |
b0dc3c8b | 603 | init_le64_type(info->tm, &le64_vt); |
3241b1d3 JT |
604 | init_shadow_spine(&spine, info); |
605 | for (level = 0; level < info->levels; level++) { | |
606 | r = remove_raw(&spine, info, | |
607 | (level == last_level ? | |
b0dc3c8b | 608 | &info->value_type : &le64_vt), |
86a3238c | 609 | root, keys[level], (unsigned int *)&index); |
3241b1d3 JT |
610 | if (r < 0) |
611 | break; | |
612 | ||
613 | n = dm_block_data(shadow_current(&spine)); | |
614 | if (level != last_level) { | |
615 | root = value64(n, index); | |
616 | continue; | |
617 | } | |
618 | ||
619 | BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); | |
620 | ||
621 | if (info->value_type.dec) | |
622 | info->value_type.dec(info->value_type.context, | |
be500ed7 | 623 | value_ptr(n, index), 1); |
3241b1d3 JT |
624 | |
625 | delete_at(n, index); | |
626 | } | |
627 | ||
b6e58b54 HT |
628 | if (!r) |
629 | *new_root = shadow_root(&spine); | |
3241b1d3 JT |
630 | exit_shadow_spine(&spine); |
631 | ||
632 | return r; | |
633 | } | |
634 | EXPORT_SYMBOL_GPL(dm_btree_remove); | |
4ec331c3 JT |
635 | |
636 | /*----------------------------------------------------------------*/ | |
637 | ||
638 | static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info, | |
639 | struct dm_btree_value_type *vt, dm_block_t root, | |
640 | uint64_t key, int *index) | |
641 | { | |
642 | int i = *index, r; | |
643 | struct btree_node *n; | |
644 | ||
645 | for (;;) { | |
646 | r = shadow_step(s, root, vt); | |
647 | if (r < 0) | |
648 | break; | |
649 | ||
650 | /* | |
651 | * We have to patch up the parent node, ugly, but I don't | |
652 | * see a way to do this automatically as part of the spine | |
653 | * op. | |
654 | */ | |
655 | if (shadow_has_parent(s)) { | |
656 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | |
0ef0b471 | 657 | |
4ec331c3 JT |
658 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), |
659 | &location, sizeof(__le64)); | |
660 | } | |
661 | ||
662 | n = dm_block_data(shadow_current(s)); | |
663 | ||
664 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) { | |
665 | *index = lower_bound(n, key); | |
666 | return 0; | |
667 | } | |
668 | ||
669 | r = rebalance_children(s, info, vt, key); | |
670 | if (r) | |
671 | break; | |
672 | ||
673 | n = dm_block_data(shadow_current(s)); | |
674 | if (le32_to_cpu(n->header.flags) & LEAF_NODE) { | |
675 | *index = lower_bound(n, key); | |
676 | return 0; | |
677 | } | |
678 | ||
679 | i = lower_bound(n, key); | |
680 | ||
681 | /* | |
682 | * We know the key is present, or else | |
683 | * rebalance_children would have returned | |
684 | * -ENODATA | |
685 | */ | |
686 | root = value64(n, i); | |
687 | } | |
688 | ||
689 | return r; | |
690 | } | |
691 | ||
692 | static int remove_one(struct dm_btree_info *info, dm_block_t root, | |
693 | uint64_t *keys, uint64_t end_key, | |
86a3238c | 694 | dm_block_t *new_root, unsigned int *nr_removed) |
4ec331c3 | 695 | { |
86a3238c | 696 | unsigned int level, last_level = info->levels - 1; |
4ec331c3 JT |
697 | int index = 0, r = 0; |
698 | struct shadow_spine spine; | |
699 | struct btree_node *n; | |
b0dc3c8b | 700 | struct dm_btree_value_type le64_vt; |
4ec331c3 JT |
701 | uint64_t k; |
702 | ||
b0dc3c8b | 703 | init_le64_type(info->tm, &le64_vt); |
4ec331c3 JT |
704 | init_shadow_spine(&spine, info); |
705 | for (level = 0; level < last_level; level++) { | |
b0dc3c8b | 706 | r = remove_raw(&spine, info, &le64_vt, |
86a3238c | 707 | root, keys[level], (unsigned int *) &index); |
4ec331c3 JT |
708 | if (r < 0) |
709 | goto out; | |
710 | ||
711 | n = dm_block_data(shadow_current(&spine)); | |
712 | root = value64(n, index); | |
713 | } | |
714 | ||
715 | r = remove_nearest(&spine, info, &info->value_type, | |
716 | root, keys[last_level], &index); | |
717 | if (r < 0) | |
718 | goto out; | |
719 | ||
720 | n = dm_block_data(shadow_current(&spine)); | |
721 | ||
722 | if (index < 0) | |
723 | index = 0; | |
724 | ||
725 | if (index >= le32_to_cpu(n->header.nr_entries)) { | |
726 | r = -ENODATA; | |
727 | goto out; | |
728 | } | |
729 | ||
730 | k = le64_to_cpu(n->keys[index]); | |
731 | if (k >= keys[last_level] && k < end_key) { | |
732 | if (info->value_type.dec) | |
733 | info->value_type.dec(info->value_type.context, | |
be500ed7 | 734 | value_ptr(n, index), 1); |
4ec331c3 JT |
735 | |
736 | delete_at(n, index); | |
aa0cd28d | 737 | keys[last_level] = k + 1ull; |
4ec331c3 JT |
738 | |
739 | } else | |
740 | r = -ENODATA; | |
741 | ||
742 | out: | |
743 | *new_root = shadow_root(&spine); | |
744 | exit_shadow_spine(&spine); | |
745 | ||
746 | return r; | |
747 | } | |
748 | ||
749 | int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, | |
750 | uint64_t *first_key, uint64_t end_key, | |
86a3238c | 751 | dm_block_t *new_root, unsigned int *nr_removed) |
4ec331c3 JT |
752 | { |
753 | int r; | |
754 | ||
755 | *nr_removed = 0; | |
756 | do { | |
757 | r = remove_one(info, root, first_key, end_key, &root, nr_removed); | |
758 | if (!r) | |
759 | (*nr_removed)++; | |
760 | } while (!r); | |
761 | ||
762 | *new_root = root; | |
763 | return r == -ENODATA ? 0 : r; | |
764 | } | |
765 | EXPORT_SYMBOL_GPL(dm_btree_remove_leaves); |