Merge tag 'devicetree-fixes-for-6.9-1' of git://git.kernel.org/pub/scm/linux/kernel...
[linux-block.git] / drivers / md / persistent-data / dm-btree-remove.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2011 Red Hat, Inc.
4  *
5  * This file is released under the GPL.
6  */
7
8 #include "dm-btree.h"
9 #include "dm-btree-internal.h"
10 #include "dm-transaction-manager.h"
11
12 #include <linux/export.h>
13 #include <linux/device-mapper.h>
14
15 #define DM_MSG_PREFIX "btree"
16
17 /*
18  * Removing an entry from a btree
19  * ==============================
20  *
21  * A very important constraint for our btree is that no node, except the
22  * root, may have fewer than a certain number of entries.
23  * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
24  *
25  * Ensuring this is complicated by the way we want to only ever hold the
26  * locks on 2 nodes concurrently, and only change nodes in a top to bottom
27  * fashion.
28  *
29  * Each node may have a left or right sibling.  When decending the spine,
30  * if a node contains only MIN_ENTRIES then we try and increase this to at
31  * least MIN_ENTRIES + 1.  We do this in the following ways:
32  *
33  * [A] No siblings => this can only happen if the node is the root, in which
34  *     case we copy the childs contents over the root.
35  *
36  * [B] No left sibling
37  *     ==> rebalance(node, right sibling)
38  *
39  * [C] No right sibling
40  *     ==> rebalance(left sibling, node)
41  *
42  * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
43  *     ==> delete node adding it's contents to left and right
44  *
45  * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
46  *     ==> rebalance(left, node, right)
47  *
48  * After these operations it's possible that the our original node no
49  * longer contains the desired sub tree.  For this reason this rebalancing
50  * is performed on the children of the current node.  This also avoids
51  * having a special case for the root.
52  *
53  * Once this rebalancing has occurred we can then step into the child node
54  * for internal nodes.  Or delete the entry for leaf nodes.
55  */
56
57 /*
58  * Some little utilities for moving node data around.
59  */
60 static void node_shift(struct btree_node *n, int shift)
61 {
62         uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
63         uint32_t value_size = le32_to_cpu(n->header.value_size);
64
65         if (shift < 0) {
66                 shift = -shift;
67                 BUG_ON(shift > nr_entries);
68                 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
69                 memmove(key_ptr(n, 0),
70                         key_ptr(n, shift),
71                         (nr_entries - shift) * sizeof(__le64));
72                 memmove(value_ptr(n, 0),
73                         value_ptr(n, shift),
74                         (nr_entries - shift) * value_size);
75         } else {
76                 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
77                 memmove(key_ptr(n, shift),
78                         key_ptr(n, 0),
79                         nr_entries * sizeof(__le64));
80                 memmove(value_ptr(n, shift),
81                         value_ptr(n, 0),
82                         nr_entries * value_size);
83         }
84 }
85
86 static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
87 {
88         uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
89         uint32_t value_size = le32_to_cpu(left->header.value_size);
90
91         if (value_size != le32_to_cpu(right->header.value_size)) {
92                 DMERR("mismatched value size");
93                 return -EILSEQ;
94         }
95
96         if (shift < 0) {
97                 shift = -shift;
98
99                 if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
100                         DMERR("bad shift");
101                         return -EINVAL;
102                 }
103
104                 memcpy(key_ptr(left, nr_left),
105                        key_ptr(right, 0),
106                        shift * sizeof(__le64));
107                 memcpy(value_ptr(left, nr_left),
108                        value_ptr(right, 0),
109                        shift * value_size);
110         } else {
111                 if (shift > le32_to_cpu(right->header.max_entries)) {
112                         DMERR("bad shift");
113                         return -EINVAL;
114                 }
115
116                 memcpy(key_ptr(right, 0),
117                        key_ptr(left, nr_left - shift),
118                        shift * sizeof(__le64));
119                 memcpy(value_ptr(right, 0),
120                        value_ptr(left, nr_left - shift),
121                        shift * value_size);
122         }
123         return 0;
124 }
125
126 /*
127  * Delete a specific entry from a leaf node.
128  */
129 static void delete_at(struct btree_node *n, unsigned int index)
130 {
131         unsigned int nr_entries = le32_to_cpu(n->header.nr_entries);
132         unsigned int nr_to_copy = nr_entries - (index + 1);
133         uint32_t value_size = le32_to_cpu(n->header.value_size);
134
135         BUG_ON(index >= nr_entries);
136
137         if (nr_to_copy) {
138                 memmove(key_ptr(n, index),
139                         key_ptr(n, index + 1),
140                         nr_to_copy * sizeof(__le64));
141
142                 memmove(value_ptr(n, index),
143                         value_ptr(n, index + 1),
144                         nr_to_copy * value_size);
145         }
146
147         n->header.nr_entries = cpu_to_le32(nr_entries - 1);
148 }
149
150 static unsigned int merge_threshold(struct btree_node *n)
151 {
152         return le32_to_cpu(n->header.max_entries) / 3;
153 }
154
155 struct child {
156         unsigned int index;
157         struct dm_block *block;
158         struct btree_node *n;
159 };
160
161 static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
162                       struct btree_node *parent,
163                       unsigned int index, struct child *result)
164 {
165         int r, inc;
166         dm_block_t root;
167
168         result->index = index;
169         root = value64(parent, index);
170
171         r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
172                                &result->block, &inc);
173         if (r)
174                 return r;
175
176         result->n = dm_block_data(result->block);
177
178         if (inc)
179                 inc_children(info->tm, result->n, vt);
180
181         *((__le64 *) value_ptr(parent, index)) =
182                 cpu_to_le64(dm_block_location(result->block));
183
184         return 0;
185 }
186
187 static void exit_child(struct dm_btree_info *info, struct child *c)
188 {
189         dm_tm_unlock(info->tm, c->block);
190 }
191
192 static int shift(struct btree_node *left, struct btree_node *right, int count)
193 {
194         int r;
195         uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
196         uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
197         uint32_t max_entries = le32_to_cpu(left->header.max_entries);
198         uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
199
200         if (max_entries != r_max_entries) {
201                 DMERR("node max_entries mismatch");
202                 return -EILSEQ;
203         }
204
205         if (nr_left - count > max_entries) {
206                 DMERR("node shift out of bounds");
207                 return -EINVAL;
208         }
209
210         if (nr_right + count > max_entries) {
211                 DMERR("node shift out of bounds");
212                 return -EINVAL;
213         }
214
215         if (!count)
216                 return 0;
217
218         if (count > 0) {
219                 node_shift(right, count);
220                 r = node_copy(left, right, count);
221                 if (r)
222                         return r;
223         } else {
224                 r = node_copy(left, right, count);
225                 if (r)
226                         return r;
227                 node_shift(right, count);
228         }
229
230         left->header.nr_entries = cpu_to_le32(nr_left - count);
231         right->header.nr_entries = cpu_to_le32(nr_right + count);
232
233         return 0;
234 }
235
236 static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
237                         struct child *l, struct child *r)
238 {
239         int ret;
240         struct btree_node *left = l->n;
241         struct btree_node *right = r->n;
242         uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
243         uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
244         /*
245          * Ensure the number of entries in each child will be greater
246          * than or equal to (max_entries / 3 + 1), so no matter which
247          * child is used for removal, the number will still be not
248          * less than (max_entries / 3).
249          */
250         unsigned int threshold = 2 * (merge_threshold(left) + 1);
251
252         if (nr_left + nr_right < threshold) {
253                 /*
254                  * Merge
255                  */
256                 node_copy(left, right, -nr_right);
257                 left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
258                 delete_at(parent, r->index);
259
260                 /*
261                  * We need to decrement the right block, but not it's
262                  * children, since they're still referenced by left.
263                  */
264                 dm_tm_dec(info->tm, dm_block_location(r->block));
265         } else {
266                 /*
267                  * Rebalance.
268                  */
269                 unsigned int target_left = (nr_left + nr_right) / 2;
270
271                 ret = shift(left, right, nr_left - target_left);
272                 if (ret)
273                         return ret;
274                 *key_ptr(parent, r->index) = right->keys[0];
275         }
276         return 0;
277 }
278
279 static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
280                       struct dm_btree_value_type *vt, unsigned int left_index)
281 {
282         int r;
283         struct btree_node *parent;
284         struct child left, right;
285
286         parent = dm_block_data(shadow_current(s));
287
288         r = init_child(info, vt, parent, left_index, &left);
289         if (r)
290                 return r;
291
292         r = init_child(info, vt, parent, left_index + 1, &right);
293         if (r) {
294                 exit_child(info, &left);
295                 return r;
296         }
297
298         r = __rebalance2(info, parent, &left, &right);
299
300         exit_child(info, &left);
301         exit_child(info, &right);
302
303         return r;
304 }
305
306 /*
307  * We dump as many entries from center as possible into left, then the rest
308  * in right, then rebalance2.  This wastes some cpu, but I want something
309  * simple atm.
310  */
311 static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
312                               struct child *l, struct child *c, struct child *r,
313                               struct btree_node *left, struct btree_node *center, struct btree_node *right,
314                               uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
315 {
316         uint32_t max_entries = le32_to_cpu(left->header.max_entries);
317         unsigned int shift = min(max_entries - nr_left, nr_center);
318
319         if (nr_left + shift > max_entries) {
320                 DMERR("node shift out of bounds");
321                 return -EINVAL;
322         }
323
324         node_copy(left, center, -shift);
325         left->header.nr_entries = cpu_to_le32(nr_left + shift);
326
327         if (shift != nr_center) {
328                 shift = nr_center - shift;
329
330                 if ((nr_right + shift) > max_entries) {
331                         DMERR("node shift out of bounds");
332                         return -EINVAL;
333                 }
334
335                 node_shift(right, shift);
336                 node_copy(center, right, shift);
337                 right->header.nr_entries = cpu_to_le32(nr_right + shift);
338         }
339         *key_ptr(parent, r->index) = right->keys[0];
340
341         delete_at(parent, c->index);
342         r->index--;
343
344         dm_tm_dec(info->tm, dm_block_location(c->block));
345         return __rebalance2(info, parent, l, r);
346 }
347
348 /*
349  * Redistributes entries among 3 sibling nodes.
350  */
351 static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
352                          struct child *l, struct child *c, struct child *r,
353                          struct btree_node *left, struct btree_node *center, struct btree_node *right,
354                          uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
355 {
356         int s, ret;
357         uint32_t max_entries = le32_to_cpu(left->header.max_entries);
358         unsigned int total = nr_left + nr_center + nr_right;
359         unsigned int target_right = total / 3;
360         unsigned int remainder = (target_right * 3) != total;
361         unsigned int target_left = target_right + remainder;
362
363         BUG_ON(target_left > max_entries);
364         BUG_ON(target_right > max_entries);
365
366         if (nr_left < nr_right) {
367                 s = nr_left - target_left;
368
369                 if (s < 0 && nr_center < -s) {
370                         /* not enough in central node */
371                         ret = shift(left, center, -nr_center);
372                         if (ret)
373                                 return ret;
374
375                         s += nr_center;
376                         ret = shift(left, right, s);
377                         if (ret)
378                                 return ret;
379
380                         nr_right += s;
381                 } else {
382                         ret = shift(left, center, s);
383                         if (ret)
384                                 return ret;
385                 }
386
387                 ret = shift(center, right, target_right - nr_right);
388                 if (ret)
389                         return ret;
390         } else {
391                 s = target_right - nr_right;
392                 if (s > 0 && nr_center < s) {
393                         /* not enough in central node */
394                         ret = shift(center, right, nr_center);
395                         if (ret)
396                                 return ret;
397                         s -= nr_center;
398                         ret = shift(left, right, s);
399                         if (ret)
400                                 return ret;
401                         nr_left -= s;
402                 } else {
403                         ret = shift(center, right, s);
404                         if (ret)
405                                 return ret;
406                 }
407
408                 ret = shift(left, center, nr_left - target_left);
409                 if (ret)
410                         return ret;
411         }
412
413         *key_ptr(parent, c->index) = center->keys[0];
414         *key_ptr(parent, r->index) = right->keys[0];
415         return 0;
416 }
417
418 static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
419                         struct child *l, struct child *c, struct child *r)
420 {
421         struct btree_node *left = l->n;
422         struct btree_node *center = c->n;
423         struct btree_node *right = r->n;
424
425         uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
426         uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
427         uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
428
429         unsigned int threshold = merge_threshold(left) * 4 + 1;
430
431         if ((left->header.max_entries != center->header.max_entries) ||
432             (center->header.max_entries != right->header.max_entries)) {
433                 DMERR("bad btree metadata, max_entries differ");
434                 return -EILSEQ;
435         }
436
437         if ((nr_left + nr_center + nr_right) < threshold) {
438                 return delete_center_node(info, parent, l, c, r, left, center, right,
439                                           nr_left, nr_center, nr_right);
440         }
441
442         return redistribute3(info, parent, l, c, r, left, center, right,
443                              nr_left, nr_center, nr_right);
444 }
445
446 static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
447                       struct dm_btree_value_type *vt, unsigned int left_index)
448 {
449         int r;
450         struct btree_node *parent = dm_block_data(shadow_current(s));
451         struct child left, center, right;
452
453         /*
454          * FIXME: fill out an array?
455          */
456         r = init_child(info, vt, parent, left_index, &left);
457         if (r)
458                 return r;
459
460         r = init_child(info, vt, parent, left_index + 1, &center);
461         if (r) {
462                 exit_child(info, &left);
463                 return r;
464         }
465
466         r = init_child(info, vt, parent, left_index + 2, &right);
467         if (r) {
468                 exit_child(info, &left);
469                 exit_child(info, &center);
470                 return r;
471         }
472
473         r = __rebalance3(info, parent, &left, &center, &right);
474
475         exit_child(info, &left);
476         exit_child(info, &center);
477         exit_child(info, &right);
478
479         return r;
480 }
481
482 static int rebalance_children(struct shadow_spine *s,
483                               struct dm_btree_info *info,
484                               struct dm_btree_value_type *vt, uint64_t key)
485 {
486         int i, r, has_left_sibling, has_right_sibling;
487         struct btree_node *n;
488
489         n = dm_block_data(shadow_current(s));
490
491         if (le32_to_cpu(n->header.nr_entries) == 1) {
492                 struct dm_block *child;
493                 dm_block_t b = value64(n, 0);
494
495                 r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
496                 if (r)
497                         return r;
498
499                 memcpy(n, dm_block_data(child),
500                        dm_bm_block_size(dm_tm_get_bm(info->tm)));
501
502                 dm_tm_dec(info->tm, dm_block_location(child));
503                 dm_tm_unlock(info->tm, child);
504                 return 0;
505         }
506
507         i = lower_bound(n, key);
508         if (i < 0)
509                 return -ENODATA;
510
511         has_left_sibling = i > 0;
512         has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
513
514         if (!has_left_sibling)
515                 r = rebalance2(s, info, vt, i);
516
517         else if (!has_right_sibling)
518                 r = rebalance2(s, info, vt, i - 1);
519
520         else
521                 r = rebalance3(s, info, vt, i - 1);
522
523         return r;
524 }
525
526 static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index)
527 {
528         int i = lower_bound(n, key);
529
530         if ((i < 0) ||
531             (i >= le32_to_cpu(n->header.nr_entries)) ||
532             (le64_to_cpu(n->keys[i]) != key))
533                 return -ENODATA;
534
535         *index = i;
536
537         return 0;
538 }
539
540 /*
541  * Prepares for removal from one level of the hierarchy.  The caller must
542  * call delete_at() to remove the entry at index.
543  */
544 static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
545                       struct dm_btree_value_type *vt, dm_block_t root,
546                       uint64_t key, unsigned int *index)
547 {
548         int i = *index, r;
549         struct btree_node *n;
550
551         for (;;) {
552                 r = shadow_step(s, root, vt);
553                 if (r < 0)
554                         break;
555
556                 /*
557                  * We have to patch up the parent node, ugly, but I don't
558                  * see a way to do this automatically as part of the spine
559                  * op.
560                  */
561                 if (shadow_has_parent(s)) {
562                         __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
563
564                         memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
565                                &location, sizeof(__le64));
566                 }
567
568                 n = dm_block_data(shadow_current(s));
569
570                 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
571                         return do_leaf(n, key, index);
572
573                 r = rebalance_children(s, info, vt, key);
574                 if (r)
575                         break;
576
577                 n = dm_block_data(shadow_current(s));
578                 if (le32_to_cpu(n->header.flags) & LEAF_NODE)
579                         return do_leaf(n, key, index);
580
581                 i = lower_bound(n, key);
582
583                 /*
584                  * We know the key is present, or else
585                  * rebalance_children would have returned
586                  * -ENODATA
587                  */
588                 root = value64(n, i);
589         }
590
591         return r;
592 }
593
594 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
595                     uint64_t *keys, dm_block_t *new_root)
596 {
597         unsigned int level, last_level = info->levels - 1;
598         int index = 0, r = 0;
599         struct shadow_spine spine;
600         struct btree_node *n;
601         struct dm_btree_value_type le64_vt;
602
603         init_le64_type(info->tm, &le64_vt);
604         init_shadow_spine(&spine, info);
605         for (level = 0; level < info->levels; level++) {
606                 r = remove_raw(&spine, info,
607                                (level == last_level ?
608                                 &info->value_type : &le64_vt),
609                                root, keys[level], (unsigned int *)&index);
610                 if (r < 0)
611                         break;
612
613                 n = dm_block_data(shadow_current(&spine));
614                 if (level != last_level) {
615                         root = value64(n, index);
616                         continue;
617                 }
618
619                 BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
620
621                 if (info->value_type.dec)
622                         info->value_type.dec(info->value_type.context,
623                                              value_ptr(n, index), 1);
624
625                 delete_at(n, index);
626         }
627
628         if (!r)
629                 *new_root = shadow_root(&spine);
630         exit_shadow_spine(&spine);
631
632         return r;
633 }
634 EXPORT_SYMBOL_GPL(dm_btree_remove);
635
636 /*----------------------------------------------------------------*/
637
638 static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
639                           struct dm_btree_value_type *vt, dm_block_t root,
640                           uint64_t key, int *index)
641 {
642         int i = *index, r;
643         struct btree_node *n;
644
645         for (;;) {
646                 r = shadow_step(s, root, vt);
647                 if (r < 0)
648                         break;
649
650                 /*
651                  * We have to patch up the parent node, ugly, but I don't
652                  * see a way to do this automatically as part of the spine
653                  * op.
654                  */
655                 if (shadow_has_parent(s)) {
656                         __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
657
658                         memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
659                                &location, sizeof(__le64));
660                 }
661
662                 n = dm_block_data(shadow_current(s));
663
664                 if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
665                         *index = lower_bound(n, key);
666                         return 0;
667                 }
668
669                 r = rebalance_children(s, info, vt, key);
670                 if (r)
671                         break;
672
673                 n = dm_block_data(shadow_current(s));
674                 if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
675                         *index = lower_bound(n, key);
676                         return 0;
677                 }
678
679                 i = lower_bound(n, key);
680
681                 /*
682                  * We know the key is present, or else
683                  * rebalance_children would have returned
684                  * -ENODATA
685                  */
686                 root = value64(n, i);
687         }
688
689         return r;
690 }
691
692 static int remove_one(struct dm_btree_info *info, dm_block_t root,
693                       uint64_t *keys, uint64_t end_key,
694                       dm_block_t *new_root, unsigned int *nr_removed)
695 {
696         unsigned int level, last_level = info->levels - 1;
697         int index = 0, r = 0;
698         struct shadow_spine spine;
699         struct btree_node *n;
700         struct dm_btree_value_type le64_vt;
701         uint64_t k;
702
703         init_le64_type(info->tm, &le64_vt);
704         init_shadow_spine(&spine, info);
705         for (level = 0; level < last_level; level++) {
706                 r = remove_raw(&spine, info, &le64_vt,
707                                root, keys[level], (unsigned int *) &index);
708                 if (r < 0)
709                         goto out;
710
711                 n = dm_block_data(shadow_current(&spine));
712                 root = value64(n, index);
713         }
714
715         r = remove_nearest(&spine, info, &info->value_type,
716                            root, keys[last_level], &index);
717         if (r < 0)
718                 goto out;
719
720         n = dm_block_data(shadow_current(&spine));
721
722         if (index < 0)
723                 index = 0;
724
725         if (index >= le32_to_cpu(n->header.nr_entries)) {
726                 r = -ENODATA;
727                 goto out;
728         }
729
730         k = le64_to_cpu(n->keys[index]);
731         if (k >= keys[last_level] && k < end_key) {
732                 if (info->value_type.dec)
733                         info->value_type.dec(info->value_type.context,
734                                              value_ptr(n, index), 1);
735
736                 delete_at(n, index);
737                 keys[last_level] = k + 1ull;
738
739         } else
740                 r = -ENODATA;
741
742 out:
743         *new_root = shadow_root(&spine);
744         exit_shadow_spine(&spine);
745
746         return r;
747 }
748
749 int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
750                            uint64_t *first_key, uint64_t end_key,
751                            dm_block_t *new_root, unsigned int *nr_removed)
752 {
753         int r;
754
755         *nr_removed = 0;
756         do {
757                 r = remove_one(info, root, first_key, end_key, &root, nr_removed);
758                 if (!r)
759                         (*nr_removed)++;
760         } while (!r);
761
762         *new_root = root;
763         return r == -ENODATA ? 0 : r;
764 }
765 EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);