fs/bcachefs/extents.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
4  *
5  * Code for managing the extent btree and dynamically updating the writeback
6  * dirty sector count.
7  */
8
9 #include "bcachefs.h"
10 #include "bkey_methods.h"
11 #include "btree_gc.h"
12 #include "btree_update.h"
13 #include "btree_update_interior.h"
14 #include "buckets.h"
15 #include "checksum.h"
16 #include "debug.h"
17 #include "dirent.h"
18 #include "disk_groups.h"
19 #include "error.h"
20 #include "extents.h"
21 #include "inode.h"
22 #include "journal.h"
23 #include "replicas.h"
24 #include "super.h"
25 #include "super-io.h"
26 #include "trace.h"
27 #include "util.h"
28 #include "xattr.h"
29
30 /* Common among btree and extent ptrs */
31
32 const struct bch_extent_ptr *
33 bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
34 {
35         const struct bch_extent_ptr *ptr;
36
37         extent_for_each_ptr(e, ptr)
38                 if (ptr->dev == dev)
39                         return ptr;
40
41         return NULL;
42 }
43
44 void bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
45 {
46         struct bch_extent_ptr *ptr;
47
48         bch2_extent_drop_ptrs(e, ptr, ptr->dev == dev);
49 }
50
51 const struct bch_extent_ptr *
52 bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
53 {
54         const struct bch_extent_ptr *ptr;
55
56         extent_for_each_ptr(e, ptr) {
57                 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
58
59                 if (ca->mi.group &&
60                     ca->mi.group - 1 == group)
61                         return ptr;
62         }
63
64         return NULL;
65 }
66
67 const struct bch_extent_ptr *
68 bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
69 {
70         const struct bch_extent_ptr *ptr;
71
72         extent_for_each_ptr(e, ptr)
73                 if (bch2_dev_in_target(c, ptr->dev, target) &&
74                     (!ptr->cached ||
75                      !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
76                         return ptr;
77
78         return NULL;
79 }
80
81 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
82 {
83         const struct bch_extent_ptr *ptr;
84         unsigned nr_ptrs = 0;
85
86         extent_for_each_ptr(e, ptr)
87                 nr_ptrs++;
88
89         return nr_ptrs;
90 }
91
92 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
93 {
94         struct bkey_s_c_extent e;
95         const struct bch_extent_ptr *ptr;
96         unsigned nr_ptrs = 0;
97
98         switch (k.k->type) {
99         case BCH_EXTENT:
100         case BCH_EXTENT_CACHED:
101                 e = bkey_s_c_to_extent(k);
102
103                 extent_for_each_ptr(e, ptr)
104                         nr_ptrs += !ptr->cached;
105                 break;
106
107         case BCH_RESERVATION:
108                 nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
109                 break;
110         }
111
112         return nr_ptrs;
113 }
114
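/*
 * Durability contributed by a single pointer: cached pointers contribute
 * nothing; otherwise it's the device's durability (zero if the device is
 * failed), raised to the stripe's redundancy if the pointer is part of an
 * erasure coded stripe.
 */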
115 static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
116                                            struct extent_ptr_decoded p)
117 {
118         unsigned i, durability = 0;
119         struct bch_dev *ca;
120
121         if (p.ptr.cached)
122                 return 0;
123
124         ca = bch_dev_bkey_exists(c, p.ptr.dev);
125
126         if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
127                 durability = max_t(unsigned, durability, ca->mi.durability);
128
129         for (i = 0; i < p.ec_nr; i++) {
130                 struct stripe *s =
131                         genradix_ptr(&c->stripes[0], p.idx);
132
133                 if (WARN_ON(!s))
134                         continue;
135
136                 durability = max_t(unsigned, durability, s->nr_redundant);
137         }
138
139         return durability;
140 }
141
142 unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
143 {
144         const union bch_extent_entry *entry;
145         struct extent_ptr_decoded p;
146         unsigned durability = 0;
147
148         extent_for_each_ptr_decode(e, p, entry)
149                 durability += bch2_extent_ptr_durability(c, p);
150
151         return durability;
152 }
153
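/*
 * Returns the number of on-disk sectors taken up by compressed data in @k:
 * the sum of compressed_size over non-cached pointers whose data is
 * compressed to less than its live size.
 */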
154 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
155 {
156         unsigned ret = 0;
157
158         switch (k.k->type) {
159         case BCH_EXTENT:
160         case BCH_EXTENT_CACHED: {
161                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
162                 const union bch_extent_entry *entry;
163                 struct extent_ptr_decoded p;
164
165                 extent_for_each_ptr_decode(e, p, entry)
166                         if (!p.ptr.cached &&
167                             p.crc.compression_type != BCH_COMPRESSION_NONE &&
168                             p.crc.compressed_size < p.crc.live_size)
169                                 ret += p.crc.compressed_size;
170         }
171         }
172
173         return ret;
174 }
175
176 bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
177                              struct bch_extent_ptr m, u64 offset)
178 {
179         const union bch_extent_entry *entry;
180         struct extent_ptr_decoded p;
181
182         extent_for_each_ptr_decode(e, p, entry)
183                 if (p.ptr.dev   == m.dev &&
184                     p.ptr.gen   == m.gen &&
185                     (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) ==
186                     (s64) m.offset  - offset)
187                         return true;
188
189         return false;
190 }
191
192 static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e,
193                                           union bch_extent_entry *entry)
194 {
195         union bch_extent_entry *i = e.v->start;
196
197         if (i == entry)
198                 return NULL;
199
200         while (extent_entry_next(i) != entry)
201                 i = extent_entry_next(i);
202         return i;
203 }
204
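/*
 * Remove @ptr from @e; a preceding crc entry that would no longer apply to
 * any pointer is removed along with it. Returns the position the following
 * entries were moved down to.
 */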
205 union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e,
206                                              struct bch_extent_ptr *ptr)
207 {
208         union bch_extent_entry *dst, *src, *prev;
209         bool drop_crc = true;
210
211         EBUG_ON(ptr < &e.v->start->ptr ||
212                 ptr >= &extent_entry_last(e)->ptr);
213         EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
214
215         src = extent_entry_next(to_entry(ptr));
216         if (src != extent_entry_last(e) &&
217             !extent_entry_is_crc(src))
218                 drop_crc = false;
219
220         dst = to_entry(ptr);
221         while ((prev = extent_entry_prev(e, dst))) {
222                 if (extent_entry_is_ptr(prev))
223                         break;
224
225                 if (extent_entry_is_crc(prev)) {
226                         if (drop_crc)
227                                 dst = prev;
228                         break;
229                 }
230
231                 dst = prev;
232         }
233
234         memmove_u64s_down(dst, src,
235                           (u64 *) extent_entry_last(e) - (u64 *) src);
236         e.k->u64s -= (u64 *) src - (u64 *) dst;
237
238         return dst;
239 }
240
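/*
 * A crc entry @u can be narrowed to @n if its data is uncompressed,
 * checksummed, covers more than just the live range, and has the same
 * encryption status as @n:
 */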
241 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
242                                   struct bch_extent_crc_unpacked n)
243 {
244         return !u.compression_type &&
245                 u.csum_type &&
246                 u.uncompressed_size > u.live_size &&
247                 bch2_csum_type_is_encryption(u.csum_type) ==
248                 bch2_csum_type_is_encryption(n.csum_type);
249 }
250
251 bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
252                                  struct bch_extent_crc_unpacked n)
253 {
254         struct bch_extent_crc_unpacked crc;
255         const union bch_extent_entry *i;
256
257         if (!n.csum_type)
258                 return false;
259
260         extent_for_each_crc(e, crc, i)
261                 if (can_narrow_crc(crc, n))
262                         return true;
263
264         return false;
265 }
266
267 /*
268  * We're writing another replica for this extent, so while we've got the data in
269  * memory we'll be computing a new checksum for the currently live data.
270  *
271  * If there are other replicas we aren't moving, and they are checksummed but
272  * not compressed, we can modify them to point to only the data that is
273  * currently live (so that readers won't have to bounce) while we've got the
274  * checksum we need:
275  */
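/*
 * For example, a partially overwritten extent may have a crc covering 128
 * sectors of which only the last 64 are still live; narrowing replaces that
 * crc with one covering just the 64 live sectors, bumping the pointer
 * offsets accordingly.
 */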
276 bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
277                              struct bch_extent_crc_unpacked n)
278 {
279         struct bch_extent_crc_unpacked u;
280         struct extent_ptr_decoded p;
281         union bch_extent_entry *i;
282         bool ret = false;
283
284         /* Find a checksum entry that covers only live data: */
285         if (!n.csum_type) {
286                 extent_for_each_crc(extent_i_to_s(e), u, i)
287                         if (!u.compression_type &&
288                             u.csum_type &&
289                             u.live_size == u.uncompressed_size) {
290                                 n = u;
291                                 goto found;
292                         }
293                 return false;
294         }
295 found:
296         BUG_ON(n.compression_type);
297         BUG_ON(n.offset);
298         BUG_ON(n.live_size != e->k.size);
299
300 restart_narrow_pointers:
301         extent_for_each_ptr_decode(extent_i_to_s(e), p, i)
302                 if (can_narrow_crc(p.crc, n)) {
303                         bch2_extent_drop_ptr(extent_i_to_s(e), &i->ptr);
304                         p.ptr.offset += p.crc.offset;
305                         p.crc = n;
306                         bch2_extent_ptr_decoded_append(e, &p);
307                         ret = true;
308                         goto restart_narrow_pointers;
309                 }
310
311         return ret;
312 }
313
314 /* returns true if not equal */
315 static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
316                                          struct bch_extent_crc_unpacked r)
317 {
318         return (l.csum_type             != r.csum_type ||
319                 l.compression_type      != r.compression_type ||
320                 l.compressed_size       != r.compressed_size ||
321                 l.uncompressed_size     != r.uncompressed_size ||
322                 l.offset                != r.offset ||
323                 l.live_size             != r.live_size ||
324                 l.nonce                 != r.nonce ||
325                 bch2_crc_cmp(l.csum, r.csum));
326 }
327
328 static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
329 {
330         struct bch_extent_ptr *ptr;
331
332         bch2_extent_drop_ptrs(e, ptr,
333                 ptr->cached &&
334                 ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
335 }
336
337 bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
338 {
339         return bch2_extent_normalize(c, k);
340 }
341
342 void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
343 {
344         switch (k->type) {
345         case BCH_EXTENT:
346         case BCH_EXTENT_CACHED: {
347                 union bch_extent_entry *entry;
348                 u64 *d = (u64 *) bkeyp_val(f, k);
349                 unsigned i;
350
351                 for (i = 0; i < bkeyp_val_u64s(f, k); i++)
352                         d[i] = swab64(d[i]);
353
354                 for (entry = (union bch_extent_entry *) d;
355                      entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
356                      entry = extent_entry_next(entry)) {
357                         switch (extent_entry_type(entry)) {
358                         case BCH_EXTENT_ENTRY_ptr:
359                                 break;
360                         case BCH_EXTENT_ENTRY_crc32:
361                                 entry->crc32.csum = swab32(entry->crc32.csum);
362                                 break;
363                         case BCH_EXTENT_ENTRY_crc64:
364                                 entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
365                                 entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
366                                 break;
367                         case BCH_EXTENT_ENTRY_crc128:
368                                 entry->crc128.csum.hi = (__force __le64)
369                                         swab64((__force u64) entry->crc128.csum.hi);
370                                 entry->crc128.csum.lo = (__force __le64)
371                                         swab64((__force u64) entry->crc128.csum.lo);
372                                 break;
373                         case BCH_EXTENT_ENTRY_stripe_ptr:
374                                 break;
375                         }
376                 }
377                 break;
378         }
379         }
380 }
381
382 static const char *extent_ptr_invalid(const struct bch_fs *c,
383                                       struct bkey_s_c_extent e,
384                                       const struct bch_extent_ptr *ptr,
385                                       unsigned size_ondisk,
386                                       bool metadata)
387 {
388         const struct bch_extent_ptr *ptr2;
389         struct bch_dev *ca;
390
391         if (ptr->dev >= c->sb.nr_devices ||
392             !c->devs[ptr->dev])
393                 return "pointer to invalid device";
394
395         ca = bch_dev_bkey_exists(c, ptr->dev);
396         if (!ca)
397                 return "pointer to invalid device";
398
399         extent_for_each_ptr(e, ptr2)
400                 if (ptr != ptr2 && ptr->dev == ptr2->dev)
401                         return "multiple pointers to same device";
402
403         if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
404                 return "offset past end of device";
405
406         if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
407                 return "offset before first bucket";
408
409         if (bucket_remainder(ca, ptr->offset) +
410             size_ondisk > ca->mi.bucket_size)
411                 return "spans multiple buckets";
412
413         return NULL;
414 }
415
416 static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
417                               struct bkey_s_c_extent e)
418 {
419         const union bch_extent_entry *entry;
420         struct bch_extent_crc_unpacked crc;
421         const struct bch_extent_ptr *ptr;
422         const struct bch_extent_stripe_ptr *ec;
423         struct bch_dev *ca;
424         bool first = true;
425
426         extent_for_each_entry(e, entry) {
427                 if (!first)
428                         pr_buf(out, " ");
429
430                 switch (__extent_entry_type(entry)) {
431                 case BCH_EXTENT_ENTRY_ptr:
432                         ptr = entry_to_ptr(entry);
433                         ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
434                                 ? bch_dev_bkey_exists(c, ptr->dev)
435                                 : NULL;
436
437                         pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
438                                (u64) ptr->offset, ptr->gen,
439                                ptr->cached ? " cached" : "",
440                                ca && ptr_stale(ca, ptr)
441                                ? " stale" : "");
442                         break;
443                 case BCH_EXTENT_ENTRY_crc32:
444                 case BCH_EXTENT_ENTRY_crc64:
445                 case BCH_EXTENT_ENTRY_crc128:
446                         crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
447
448                         pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
449                                crc.compressed_size,
450                                crc.uncompressed_size,
451                                crc.offset, crc.nonce,
452                                crc.csum_type,
453                                crc.compression_type);
454                         break;
455                 case BCH_EXTENT_ENTRY_stripe_ptr:
456                         ec = &entry->stripe_ptr;
457
458                         pr_buf(out, "ec: idx %llu block %u",
459                                (u64) ec->idx, ec->block);
460                         break;
461                 default:
462                         pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
463                         goto out;
464                 }
465
466                 first = false;
467         }
468 out:
469         if (bkey_extent_is_cached(e.k))
470                 pr_buf(out, " cached");
471 }
472
473 static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
474                                                    unsigned dev)
475 {
476         struct bch_dev_io_failures *i;
477
478         for (i = f->devs; i < f->devs + f->nr; i++)
479                 if (i->dev == dev)
480                         return i;
481
482         return NULL;
483 }
484
485 void bch2_mark_io_failure(struct bch_io_failures *failed,
486                           struct extent_ptr_decoded *p)
487 {
488         struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
489
490         if (!f) {
491                 BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
492
493                 f = &failed->devs[failed->nr++];
494                 f->dev          = p->ptr.dev;
495                 f->idx          = p->idx;
496                 f->nr_failed    = 1;
497                 f->nr_retries   = 0;
498         } else if (p->idx != f->idx) {
499                 f->idx          = p->idx;
500                 f->nr_failed    = 1;
501                 f->nr_retries   = 0;
502         } else {
503                 f->nr_failed++;
504         }
505 }
506
507 /*
508  * returns true if p1 is better than p2:
509  */
510 static inline bool ptr_better(struct bch_fs *c,
511                               const struct extent_ptr_decoded p1,
512                               const struct extent_ptr_decoded p2)
513 {
514         if (likely(!p1.idx && !p2.idx)) {
515                 struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
516                 struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
517
518                 u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
519                 u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
520
521                 /* Pick at random, biased in favor of the faster device: */
522
523                 return bch2_rand_range(l1 + l2) > l1;
524         }
525
526         if (force_reconstruct_read(c))
527                 return p1.idx > p2.idx;
528
529         return p1.idx < p2.idx;
530 }
531
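/*
 * Pick the best pointer to read from, skipping stale cached pointers and
 * unreadable devices. p.idx encodes how to do the read: 0 means read the
 * replica directly, a nonzero value means reconstructing via erasure coding;
 * previous failures recorded in @failed bump idx so retries take a different
 * path. Returns 1 if a usable pointer was found, 0 otherwise.
 */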
532 static int extent_pick_read_device(struct bch_fs *c,
533                                    struct bkey_s_c_extent e,
534                                    struct bch_io_failures *failed,
535                                    struct extent_ptr_decoded *pick)
536 {
537         const union bch_extent_entry *entry;
538         struct extent_ptr_decoded p;
539         struct bch_dev_io_failures *f;
540         struct bch_dev *ca;
541         int ret = 0;
542
543         extent_for_each_ptr_decode(e, p, entry) {
544                 ca = bch_dev_bkey_exists(c, p.ptr.dev);
545
546                 if (p.ptr.cached && ptr_stale(ca, &p.ptr))
547                         continue;
548
549                 f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
550                 if (f)
551                         p.idx = f->nr_failed < f->nr_retries
552                                 ? f->idx
553                                 : f->idx + 1;
554
555                 if (!p.idx &&
556                     !bch2_dev_is_readable(ca))
557                         p.idx++;
558
559                 if (force_reconstruct_read(c) &&
560                     !p.idx && p.ec_nr)
561                         p.idx++;
562
563                 if (p.idx >= p.ec_nr + 1)
564                         continue;
565
566                 if (ret && !ptr_better(c, p, *pick))
567                         continue;
568
569                 *pick = p;
570                 ret = 1;
571         }
572
573         return ret;
574 }
575
576 /* Btree ptrs */
577
578 const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
579 {
580         if (bkey_extent_is_cached(k.k))
581                 return "cached";
582
583         if (k.k->size)
584                 return "nonzero key size";
585
586         if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
587                 return "value too big";
588
589         switch (k.k->type) {
590         case BCH_EXTENT: {
591                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
592                 const union bch_extent_entry *entry;
593                 const struct bch_extent_ptr *ptr;
594                 const char *reason;
595
596                 extent_for_each_entry(e, entry) {
597                         if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
598                                 return "invalid extent entry type";
599
600                         if (!extent_entry_is_ptr(entry))
601                                 return "has non ptr field";
602                 }
603
604                 extent_for_each_ptr(e, ptr) {
605                         reason = extent_ptr_invalid(c, e, ptr,
606                                                     c->opts.btree_node_size,
607                                                     true);
608                         if (reason)
609                                 return reason;
610                 }
611
612                 return NULL;
613         }
614
615         default:
616                 return "invalid value type";
617         }
618 }
619
620 void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
621                                struct bkey_s_c k)
622 {
623         struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
624         const struct bch_extent_ptr *ptr;
625         unsigned seq;
626         const char *err;
627         char buf[160];
628         struct bucket_mark mark;
629         struct bch_dev *ca;
630         unsigned replicas = 0;
631         bool bad;
632
633         extent_for_each_ptr(e, ptr) {
634                 ca = bch_dev_bkey_exists(c, ptr->dev);
635                 replicas++;
636
637                 if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
638                         continue;
639
640                 err = "stale";
641                 if (ptr_stale(ca, ptr))
642                         goto err;
643
644                 do {
645                         seq = read_seqcount_begin(&c->gc_pos_lock);
646                         mark = ptr_bucket_mark(ca, ptr);
647
648                         bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
649                                 (mark.data_type != BCH_DATA_BTREE ||
650                                  mark.dirty_sectors < c->opts.btree_node_size);
651                 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
652
653                 err = "inconsistent";
654                 if (bad)
655                         goto err;
656         }
657
658         if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
659             !bch2_bkey_replicas_marked(c, btree_node_type(b),
660                                        e.s_c, false)) {
661                 bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k);
662                 bch2_fs_bug(c,
663                         "btree key bad (replicas not marked in superblock):\n%s",
664                         buf);
665                 return;
666         }
667
668         return;
669 err:
670         bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b), k);
671         bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
672                     err, buf, PTR_BUCKET_NR(ca, ptr),
673                     mark.gen, (unsigned) mark.v.counter);
674 }
675
676 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
677                             struct bkey_s_c k)
678 {
679         const char *invalid;
680
681         if (bkey_extent_is_data(k.k))
682                 extent_print_ptrs(out, c, bkey_s_c_to_extent(k));
683
684         invalid = bch2_btree_ptr_invalid(c, k);
685         if (invalid)
686                 pr_buf(out, " invalid: %s", invalid);
687 }
688
689 int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
690                         struct bch_io_failures *failed,
691                         struct extent_ptr_decoded *pick)
692 {
693         return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
694                                        failed, pick);
695 }
696
697 /* Extents */
698
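/*
 * Cut the front of @k off at @where: shrinks the key and adjusts pointer and
 * crc offsets so the remaining extent still refers to the same data. Returns
 * false if @where is at or before the start of @k.
 */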
699 bool __bch2_cut_front(struct bpos where, struct bkey_s k)
700 {
701         u64 len = 0;
702
703         if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
704                 return false;
705
706         EBUG_ON(bkey_cmp(where, k.k->p) > 0);
707
708         len = k.k->p.offset - where.offset;
709
710         BUG_ON(len > k.k->size);
711
712         /*
713          * Don't readjust offset if the key size is now 0, because that could
714          * cause offset to point to the next bucket:
715          */
716         if (!len)
717                 k.k->type = KEY_TYPE_DELETED;
718         else if (bkey_extent_is_data(k.k)) {
719                 struct bkey_s_extent e = bkey_s_to_extent(k);
720                 union bch_extent_entry *entry;
721                 bool seen_crc = false;
722
723                 extent_for_each_entry(e, entry) {
724                         switch (extent_entry_type(entry)) {
725                         case BCH_EXTENT_ENTRY_ptr:
726                                 if (!seen_crc)
727                                         entry->ptr.offset += e.k->size - len;
728                                 break;
729                         case BCH_EXTENT_ENTRY_crc32:
730                                 entry->crc32.offset += e.k->size - len;
731                                 break;
732                         case BCH_EXTENT_ENTRY_crc64:
733                                 entry->crc64.offset += e.k->size - len;
734                                 break;
735                         case BCH_EXTENT_ENTRY_crc128:
736                                 entry->crc128.offset += e.k->size - len;
737                                 break;
738                         case BCH_EXTENT_ENTRY_stripe_ptr:
739                                 break;
740                         }
741
742                         if (extent_entry_is_crc(entry))
743                                 seen_crc = true;
744                 }
745         }
746
747         k.k->size = len;
748
749         return true;
750 }
751
752 bool bch2_cut_back(struct bpos where, struct bkey *k)
753 {
754         u64 len = 0;
755
756         if (bkey_cmp(where, k->p) >= 0)
757                 return false;
758
759         EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
760
761         len = where.offset - bkey_start_offset(k);
762
763         BUG_ON(len > k->size);
764
765         k->p = where;
766         k->size = len;
767
768         if (!len)
769                 k->type = KEY_TYPE_DELETED;
770
771         return true;
772 }
773
774 /**
775  * bch2_key_resize - adjust size of @k
776  *
777  * bkey_start_offset(k) will be preserved; modifies where the extent ends
778  */
779 void bch2_key_resize(struct bkey *k,
780                     unsigned new_size)
781 {
782         k->p.offset -= k->size;
783         k->p.offset += new_size;
784         k->size = new_size;
785 }
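/*
 * e.g. bch2_key_resize() on a key covering sectors [16, 48) with new_size 8
 * gives [16, 24): p.offset goes from 48 to 24, size from 32 to 8.
 */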
786
787 static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
788                           struct bkey_i *src)
789 {
790         struct bkey_format *f = &b->format;
791         struct bkey_i *dst_unpacked;
792         struct bkey_packed tmp;
793
794         if ((dst_unpacked = packed_to_bkey(dst)))
795                 dst_unpacked->k = src->k;
796         else if (bch2_bkey_pack_key(&tmp, &src->k, f))
797                 memcpy_u64s(dst, &tmp, f->key_u64s);
798         else
799                 return false;
800
801         memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k));
802         return true;
803 }
804
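/*
 * State for a single extent insert as it walks overlapping extents:
 * @committed tracks how far the insert has been processed, @whiteout is a
 * copy of the insert key used for journalling deletions, and the flags say
 * whether the overwrite still needs to be journalled, whether a key still
 * needs to be inserted into the btree, and whether we're deleting (i.e.
 * inserting a whiteout).
 */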
805 struct extent_insert_state {
806         struct btree_insert             *trans;
807         struct btree_insert_entry       *insert;
808         struct bpos                     committed;
809
810         /* for deleting: */
811         struct bkey_i                   whiteout;
812         bool                            update_journal;
813         bool                            update_btree;
814         bool                            deleting;
815 };
816
817 static bool bch2_extent_merge_inline(struct bch_fs *,
818                                      struct btree_iter *,
819                                      struct bkey_packed *,
820                                      struct bkey_packed *,
821                                      bool);
822
823 static void verify_extent_nonoverlapping(struct btree *b,
824                                          struct btree_node_iter *_iter,
825                                          struct bkey_i *insert)
826 {
827 #ifdef CONFIG_BCACHEFS_DEBUG
828         struct btree_node_iter iter;
829         struct bkey_packed *k;
830         struct bkey uk;
831
832         iter = *_iter;
833         k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_DISCARD);
834         BUG_ON(k &&
835                (uk = bkey_unpack_key(b, k),
836                 bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
837
838         iter = *_iter;
839         k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_DISCARD);
840 #if 0
841         BUG_ON(k &&
842                (uk = bkey_unpack_key(b, k),
843                 bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
844 #else
845         if (k &&
846             (uk = bkey_unpack_key(b, k),
847              bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
848                 char buf1[100];
849                 char buf2[100];
850
851                 bch2_bkey_to_text(&PBUF(buf1), &insert->k);
852                 bch2_bkey_to_text(&PBUF(buf2), &uk);
853
854                 bch2_dump_btree_node(b);
855                 panic("insert > next :\n"
856                       "insert %s\n"
857                       "next   %s\n",
858                       buf1, buf2);
859         }
860 #endif
861
862 #endif
863 }
864
865 static void verify_modified_extent(struct btree_iter *iter,
866                                    struct bkey_packed *k)
867 {
868         bch2_btree_iter_verify(iter, iter->l[0].b);
869         bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s);
870 }
871
872 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
873                                struct bkey_i *insert)
874 {
875         struct btree_iter_level *l = &iter->l[0];
876         struct btree_node_iter node_iter;
877         struct bkey_packed *k;
878
879         BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
880
881         EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
882         verify_extent_nonoverlapping(l->b, &l->iter, insert);
883
884         node_iter = l->iter;
885         k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_DISCARD);
886         if (k && !bkey_written(l->b, k) &&
887             bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true))
888                 return;
889
890         node_iter = l->iter;
891         k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_DISCARD);
892         if (k && !bkey_written(l->b, k) &&
893             bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
894                 return;
895
896         k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
897
898         bch2_bset_insert(l->b, &l->iter, k, insert, 0);
899         bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
900         bch2_btree_iter_verify(iter, l->b);
901 }
902
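/*
 * Insert the portion of s->insert->k up to s->committed into the btree
 * and/or journal (as dictated by update_btree/update_journal), advance the
 * iterator position, and cut that portion off the front of the insert key.
 */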
903 static void extent_insert_committed(struct extent_insert_state *s)
904 {
905         struct bch_fs *c = s->trans->c;
906         struct btree_iter *iter = s->insert->iter;
907         struct bkey_i *insert = s->insert->k;
908         BKEY_PADDED(k) split;
909
910         EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
911         EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
912
913         bkey_copy(&split.k, insert);
914         if (s->deleting)
915                 split.k.k.type = KEY_TYPE_DISCARD;
916
917         bch2_cut_back(s->committed, &split.k.k);
918
919         if (!bkey_cmp(s->committed, iter->pos))
920                 return;
921
922         bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
923
924         if (s->update_btree) {
925                 if (debug_check_bkeys(c))
926                         bch2_bkey_debugcheck(c, iter->l[0].b,
927                                              bkey_i_to_s_c(&split.k));
928
929                 EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size);
930
931                 extent_bset_insert(c, iter, &split.k);
932         }
933
934         if (s->update_journal) {
935                 bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout);
936                 if (s->deleting)
937                         split.k.k.type = KEY_TYPE_DISCARD;
938
939                 bch2_cut_back(s->committed, &split.k.k);
940
941                 EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size);
942
943                 bch2_btree_journal_key(s->trans, iter, &split.k);
944         }
945
946         bch2_cut_front(s->committed, insert);
947
948         insert->k.needs_whiteout        = false;
949 }
950
951 void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
952 {
953         struct btree *b = iter->l[0].b;
954
955         BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
956
957         bch2_cut_back(b->key.k.p, &k->k);
958
959         BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0);
960 }
961
962 enum btree_insert_ret
963 bch2_extent_can_insert(struct btree_insert *trans,
964                        struct btree_insert_entry *insert,
965                        unsigned *u64s)
966 {
967         struct btree_iter_level *l = &insert->iter->l[0];
968         struct btree_node_iter node_iter = l->iter;
969         enum bch_extent_overlap overlap;
970         struct bkey_packed *_k;
971         struct bkey unpacked;
972         struct bkey_s_c k;
973         int sectors;
974
975         BUG_ON(trans->flags & BTREE_INSERT_ATOMIC &&
976                !bch2_extent_is_atomic(&insert->k->k, insert->iter));
977
978         /*
979          * We avoid creating whiteouts whenever possible when deleting, but
980          * those optimizations mean we may potentially insert two whiteouts
981          * instead of one (when we overlap with the front of one extent and the
982          * back of another):
983          */
984         if (bkey_whiteout(&insert->k->k))
985                 *u64s += BKEY_U64s;
986
987         _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
988                                               KEY_TYPE_DISCARD);
989         if (!_k)
990                 return BTREE_INSERT_OK;
991
992         k = bkey_disassemble(l->b, _k, &unpacked);
993
994         overlap = bch2_extent_overlap(&insert->k->k, k.k);
995
996         /* account for having to split existing extent: */
997         if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
998                 *u64s += _k->u64s;
999
1000         if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
1001             (sectors = bch2_extent_is_compressed(k))) {
1002                 int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
1003
1004                 if (trans->flags & BTREE_INSERT_NOFAIL)
1005                         flags |= BCH_DISK_RESERVATION_NOFAIL;
1006
1007                 switch (bch2_disk_reservation_add(trans->c,
1008                                 trans->disk_res,
1009                                 sectors, flags)) {
1010                 case 0:
1011                         break;
1012                 case -ENOSPC:
1013                         return BTREE_INSERT_ENOSPC;
1014                 case -EINTR:
1015                         return BTREE_INSERT_NEED_GC_LOCK;
1016                 default:
1017                         BUG();
1018                 }
1019         }
1020
1021         return BTREE_INSERT_OK;
1022 }
1023
1024 static void
1025 extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
1026               struct bkey_packed *_k, struct bkey_s k,
1027               enum bch_extent_overlap overlap)
1028 {
1029         struct bch_fs *c = s->trans->c;
1030         struct btree_iter *iter = s->insert->iter;
1031         struct btree_iter_level *l = &iter->l[0];
1032
1033         switch (overlap) {
1034         case BCH_EXTENT_OVERLAP_FRONT:
1035                 /* insert overlaps with start of k: */
1036                 __bch2_cut_front(insert->k.p, k);
1037                 BUG_ON(bkey_deleted(k.k));
1038                 extent_save(l->b, _k, k.k);
1039                 verify_modified_extent(iter, _k);
1040                 break;
1041
1042         case BCH_EXTENT_OVERLAP_BACK:
1043                 /* insert overlaps with end of k: */
1044                 bch2_cut_back(bkey_start_pos(&insert->k), k.k);
1045                 BUG_ON(bkey_deleted(k.k));
1046                 extent_save(l->b, _k, k.k);
1047
1048                 /*
1049                  * As the auxiliary tree is indexed by the end of the
1050                  * key and we've just changed the end, update the
1051                  * auxiliary tree.
1052                  */
1053                 bch2_bset_fix_invalidated_key(l->b, _k);
1054                 bch2_btree_node_iter_fix(iter, l->b, &l->iter,
1055                                          _k, _k->u64s, _k->u64s);
1056                 verify_modified_extent(iter, _k);
1057                 break;
1058
1059         case BCH_EXTENT_OVERLAP_ALL: {
1060                 /* The insert key completely covers k, invalidate k */
1061                 if (!bkey_whiteout(k.k))
1062                         btree_account_key_drop(l->b, _k);
1063
1064                 k.k->size = 0;
1065                 k.k->type = KEY_TYPE_DELETED;
1066
1067                 if (_k >= btree_bset_last(l->b)->start) {
1068                         unsigned u64s = _k->u64s;
1069
1070                         bch2_bset_delete(l->b, _k, _k->u64s);
1071                         bch2_btree_node_iter_fix(iter, l->b, &l->iter,
1072                                                  _k, u64s, 0);
1073                         bch2_btree_iter_verify(iter, l->b);
1074                 } else {
1075                         extent_save(l->b, _k, k.k);
1076                         bch2_btree_node_iter_fix(iter, l->b, &l->iter,
1077                                                  _k, _k->u64s, _k->u64s);
1078                         verify_modified_extent(iter, _k);
1079                 }
1080
1081                 break;
1082         }
1083         case BCH_EXTENT_OVERLAP_MIDDLE: {
1084                 BKEY_PADDED(k) split;
1085                 /*
1086                  * The insert key falls 'in the middle' of k
1087                  * The insert key splits k in 3:
1088                  * - start only in k, preserve
1089                  * - middle common section, invalidate in k
1090                  * - end only in k, preserve
1091                  *
1092                  * We update the old key to preserve the start,
1093                  * insert will be the new common section,
1094                  * we manually insert the end that we are preserving.
1095                  *
1096                  * modify k _before_ doing the insert (which will move
1097                  * what k points to)
1098                  */
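                /*
                 * For example:
                 *   old k:   |-------------------|
                 *   insert:        |-------|
                 *   result:  |split|-------|--k--|
                 * where split keeps the front of the old key and k (cut at
                 * insert->k.p) keeps the back.
                 */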
1099                 bkey_reassemble(&split.k, k.s_c);
1100                 split.k.k.needs_whiteout |= bkey_written(l->b, _k);
1101
1102                 bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
1103                 BUG_ON(bkey_deleted(&split.k.k));
1104
1105                 __bch2_cut_front(insert->k.p, k);
1106                 BUG_ON(bkey_deleted(k.k));
1107                 extent_save(l->b, _k, k.k);
1108                 verify_modified_extent(iter, _k);
1109
1110                 extent_bset_insert(c, iter, &split.k);
1111                 break;
1112         }
1113         }
1114 }
1115
1116 static void __bch2_insert_fixup_extent(struct extent_insert_state *s)
1117 {
1118         struct btree_iter *iter = s->insert->iter;
1119         struct btree_iter_level *l = &iter->l[0];
1120         struct bkey_packed *_k;
1121         struct bkey unpacked;
1122         struct bkey_i *insert = s->insert->k;
1123
1124         while (bkey_cmp(s->committed, insert->k.p) < 0 &&
1125                (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
1126                                                       KEY_TYPE_DISCARD))) {
1127                 struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
1128                 enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k);
1129
1130                 EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
1131
1132                 if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
1133                         break;
1134
1135                 s->committed = bpos_min(s->insert->k->k.p, k.k->p);
1136
1137                 if (!bkey_whiteout(k.k))
1138                         s->update_journal = true;
1139
1140                 if (!s->update_journal) {
1141                         bch2_cut_front(s->committed, insert);
1142                         bch2_cut_front(s->committed, &s->whiteout);
1143                         bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
1144                         goto next;
1145                 }
1146
1147                 /*
1148                  * When deleting, if possible just do it by switching the type
1149                  * of the key we're deleting, instead of creating and inserting
1150                  * a new whiteout:
1151                  */
1152                 if (s->deleting &&
1153                     !s->update_btree &&
1154                     !bkey_cmp(insert->k.p, k.k->p) &&
1155                     !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
1156                         if (!bkey_whiteout(k.k)) {
1157                                 btree_account_key_drop(l->b, _k);
1158                                 _k->type = KEY_TYPE_DISCARD;
1159                                 reserve_whiteout(l->b, _k);
1160                         }
1161                         break;
1162                 }
1163
1164                 if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
1165                         insert->k.needs_whiteout = true;
1166                         s->update_btree = true;
1167                 }
1168
1169                 if (s->update_btree &&
1170                     overlap == BCH_EXTENT_OVERLAP_ALL &&
1171                     bkey_whiteout(k.k) &&
1172                     k.k->needs_whiteout) {
1173                         unreserve_whiteout(l->b, _k);
1174                         _k->needs_whiteout = false;
1175                 }
1176
1177                 extent_squash(s, insert, _k, k, overlap);
1178
1179                 if (!s->update_btree)
1180                         bch2_cut_front(s->committed, insert);
1181 next:
1182                 if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
1183                     overlap == BCH_EXTENT_OVERLAP_MIDDLE)
1184                         break;
1185         }
1186
1187         if (bkey_cmp(s->committed, insert->k.p) < 0)
1188                 s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p);
1189
1190         /*
1191          * We may have skipped past some deleted extents greater than the insert
1192          * key, before we got to a non-deleted extent and knew we could bail out;
1193          * rewind the iterator a bit if necessary:
1194          */
1195         {
1196                 struct btree_node_iter node_iter = l->iter;
1197
1198                 while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
1199                        bkey_cmp_left_packed(l->b, _k, &s->committed) > 0)
1200                         l->iter = node_iter;
1201         }
1202 }
1203
1204 /**
1205  * bch2_insert_fixup_extent - insert a new extent and deal with overlaps
1206  *
1207  * this may result in not actually doing the insert, or inserting some subset
1208  * of the insert key. For cmpxchg operations this is where that logic lives.
1209  *
1210  * All subsets of @insert that need to be inserted are inserted using
1211  * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
1212  * returns false, setting @iter->pos for the prefix of @insert that actually got
1213  * inserted.
1214  *
1215  * BSET INVARIANTS: this function is responsible for maintaining all the
1216  * invariants for bsets of extents in memory. Things get really hairy with 0
1217  * size extents
1218  *
1219  * within one bset:
1220  *
1221  * bkey_start_pos(bkey_next(k)) >= k
1222  * or bkey_start_offset(bkey_next(k)) >= k->offset
1223  *
1224  * i.e. strict ordering, no overlapping extents.
1225  *
1226  * multiple bsets (i.e. full btree node):
1227  *
1228  * ∀ k, j
1229  *   k.size != 0 ∧ j.size != 0 →
1230  *     ¬ (k > bkey_start_pos(j) ∧ k < j)
1231  *
1232  * i.e. no two overlapping keys _of nonzero size_
1233  *
1234  * We can't realistically maintain this invariant for zero size keys because of
1235  * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
1236  * there may be another 0 size key between them in another bset, and it will
1237  * thus overlap with the merged key.
1238  *
1239  * In addition, the end of iter->pos indicates how much has been processed.
1240  * If the end of iter->pos is not the same as the end of insert, then
1241  * key insertion needs to continue/be retried.
1242  */
1243 enum btree_insert_ret
1244 bch2_insert_fixup_extent(struct btree_insert *trans,
1245                          struct btree_insert_entry *insert)
1246 {
1247         struct btree_iter *iter = insert->iter;
1248         struct btree *b         = iter->l[0].b;
1249         struct extent_insert_state s = {
1250                 .trans          = trans,
1251                 .insert         = insert,
1252                 .committed      = iter->pos,
1253
1254                 .whiteout       = *insert->k,
1255                 .update_journal = !bkey_whiteout(&insert->k->k),
1256                 .update_btree   = !bkey_whiteout(&insert->k->k),
1257                 .deleting       = bkey_whiteout(&insert->k->k),
1258         };
1259
1260         EBUG_ON(iter->level);
1261         EBUG_ON(!insert->k->k.size);
1262
1263         /*
1264          * As we process overlapping extents, we advance @iter->pos both to
1265          * signal to our caller (btree_insert_key()) how much of @insert->k has
1266          * been inserted, and also to keep @iter->pos consistent with
1267          * @insert->k and the node iterator that we're advancing:
1268          */
1269         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1270
1271         __bch2_insert_fixup_extent(&s);
1272
1273         extent_insert_committed(&s);
1274
1275         EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
1276         EBUG_ON(bkey_cmp(iter->pos, s.committed));
1277
1278         if (insert->k->k.size) {
1279                 /* got to the end of this leaf node */
1280                 BUG_ON(bkey_cmp(iter->pos, b->key.k.p));
1281                 return BTREE_INSERT_NEED_TRAVERSE;
1282         }
1283
1284         return BTREE_INSERT_OK;
1285 }
1286
1287 const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
1288 {
1289         if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
1290                 return "value too big";
1291
1292         if (!k.k->size)
1293                 return "zero key size";
1294
1295         switch (k.k->type) {
1296         case BCH_EXTENT:
1297         case BCH_EXTENT_CACHED: {
1298                 struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
1299                 const union bch_extent_entry *entry;
1300                 struct bch_extent_crc_unpacked crc;
1301                 const struct bch_extent_ptr *ptr;
1302                 unsigned size_ondisk = e.k->size;
1303                 const char *reason;
1304                 unsigned nonce = UINT_MAX;
1305
1306                 extent_for_each_entry(e, entry) {
1307                         if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
1308                                 return "invalid extent entry type";
1309
1310                         switch (extent_entry_type(entry)) {
1311                         case BCH_EXTENT_ENTRY_ptr:
1312                                 ptr = entry_to_ptr(entry);
1313
1314                                 reason = extent_ptr_invalid(c, e, &entry->ptr,
1315                                                             size_ondisk, false);
1316                                 if (reason)
1317                                         return reason;
1318                                 break;
1319                         case BCH_EXTENT_ENTRY_crc32:
1320                         case BCH_EXTENT_ENTRY_crc64:
1321                         case BCH_EXTENT_ENTRY_crc128:
1322                                 crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
1323
1324                                 if (crc.offset + e.k->size >
1325                                     crc.uncompressed_size)
1326                                         return "checksum offset + key size > uncompressed size";
1327
1328                                 size_ondisk = crc.compressed_size;
1329
1330                                 if (!bch2_checksum_type_valid(c, crc.csum_type))
1331                                         return "invalid checksum type";
1332
1333                                 if (crc.compression_type >= BCH_COMPRESSION_NR)
1334                                         return "invalid compression type";
1335
1336                                 if (bch2_csum_type_is_encryption(crc.csum_type)) {
1337                                         if (nonce == UINT_MAX)
1338                                                 nonce = crc.offset + crc.nonce;
1339                                         else if (nonce != crc.offset + crc.nonce)
1340                                                 return "incorrect nonce";
1341                                 }
1342                                 break;
1343                         case BCH_EXTENT_ENTRY_stripe_ptr:
1344                                 break;
1345                         }
1346                 }
1347
1348                 return NULL;
1349         }
1350
1351         case BCH_RESERVATION: {
1352                 struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
1353
1354                 if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
1355                         return "incorrect value size";
1356
1357                 if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
1358                         return "invalid nr_replicas";
1359
1360                 return NULL;
1361         }
1362
1363         default:
1364                 return "invalid value type";
1365         }
1366 }
1367
1368 static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
1369                                           struct bkey_s_c_extent e)
1370 {
1371         const struct bch_extent_ptr *ptr;
1372         struct bch_dev *ca;
1373         struct bucket_mark mark;
1374         unsigned seq, stale;
1375         char buf[160];
1376         bool bad;
1377         unsigned replicas = 0;
1378
1379         /*
1380          * XXX: we should be doing most/all of these checks at startup time,
1381          * where we check bch2_bkey_invalid() in btree_node_read_done()
1382          *
1383          * But note that we can't check for stale pointers or incorrect gc marks
1384          * until after journal replay is done (it might be an extent that's
1385          * going to get overwritten during replay)
1386          */
1387
1388         extent_for_each_ptr(e, ptr) {
1389                 ca = bch_dev_bkey_exists(c, ptr->dev);
1390                 replicas++;
1391
1392                 /*
1393                  * If journal replay hasn't finished, we might be seeing keys
1394                  * that will be overwritten by the time journal replay is done:
1395                  */
1396                 if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
1397                         continue;
1398
1399                 stale = 0;
1400
1401                 do {
1402                         seq = read_seqcount_begin(&c->gc_pos_lock);
1403                         mark = ptr_bucket_mark(ca, ptr);
1404
1405                         /* between mark and bucket gen */
1406                         smp_rmb();
1407
1408                         stale = ptr_stale(ca, ptr);
1409
1410                         bch2_fs_bug_on(stale && !ptr->cached, c,
1411                                          "stale dirty pointer");
1412
1413                         bch2_fs_bug_on(stale > 96, c,
1414                                          "key too stale: %i",
1415                                          stale);
1416
1417                         if (stale)
1418                                 break;
1419
1420                         bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
1421                                 (mark.data_type != BCH_DATA_USER ||
1422                                  !(ptr->cached
1423                                    ? mark.cached_sectors
1424                                    : mark.dirty_sectors));
1425                 } while (read_seqcount_retry(&c->gc_pos_lock, seq));
1426
1427                 if (bad)
1428                         goto bad_ptr;
1429         }
1430
1431         if (replicas > BCH_REPLICAS_MAX) {
1432                 bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b),
1433                                       e.s_c);
1434                 bch2_fs_bug(c,
1435                         "extent key bad (too many replicas: %u): %s",
1436                         replicas, buf);
1437                 return;
1438         }
1439
1440         if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
1441             !bch2_bkey_replicas_marked(c, btree_node_type(b),
1442                                        e.s_c, false)) {
1443                 bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b),
1444                                       e.s_c);
1445                 bch2_fs_bug(c,
1446                         "extent key bad (replicas not marked in superblock):\n%s",
1447                         buf);
1448                 return;
1449         }
1450
1451         return;
1452
1453 bad_ptr:
1454         bch2_bkey_val_to_text(&PBUF(buf), c, btree_node_type(b),
1455                               e.s_c);
1456         bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
1457                    "gen %i type %u", buf,
1458                    PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
1459 }
1460
1461 void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
1462 {
1463         switch (k.k->type) {
1464         case BCH_EXTENT:
1465         case BCH_EXTENT_CACHED:
1466                 bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
1467                 break;
1468         case BCH_RESERVATION:
1469                 break;
1470         default:
1471                 BUG();
1472         }
1473 }
1474
1475 void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
1476                          struct bkey_s_c k)
1477 {
1478         const char *invalid;
1479
1480         if (bkey_extent_is_data(k.k))
1481                 extent_print_ptrs(out, c, bkey_s_c_to_extent(k));
1482
1483         invalid = bch2_extent_invalid(c, k);
1484         if (invalid)
1485                 pr_buf(out, " invalid: %s", invalid);
1486 }
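
/*
 * Illustrative sketch, not part of the original file: formatting an extent
 * key into a stack buffer with bch2_extent_to_text(), following the
 * &PBUF(buf) pattern used elsewhere in this file. The buffer size and the
 * pr_debug() sink are arbitrary choices for the example.
 */
static void __maybe_unused example_extent_to_text(struct bch_fs *c,
						  struct bkey_s_c k)
{
	char buf[200];

	bch2_extent_to_text(&PBUF(buf), c, k);
	pr_debug("%s\n", buf);
}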
1487
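/*
 * Initialize a packed crc entry from an unpacked crc, picking the smallest
 * on-disk variant (crc32/crc64/crc128) that can represent the checksum
 * bytes, the sizes and the nonce - e.g. a 4-byte crc32c checksum normally
 * fits in bch_extent_crc32, while a 16-byte chacha20/poly1305 MAC needs
 * bch_extent_crc128.
 */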
1488 static void bch2_extent_crc_init(union bch_extent_crc *crc,
1489                                  struct bch_extent_crc_unpacked new)
1490 {
1491 #define common_fields(_crc)                                             \
1492                 .csum_type              = _crc.csum_type,               \
1493                 .compression_type       = _crc.compression_type,        \
1494                 ._compressed_size       = _crc.compressed_size - 1,     \
1495                 ._uncompressed_size     = _crc.uncompressed_size - 1,   \
1496                 .offset                 = _crc.offset
1497
1498         if (bch_crc_bytes[new.csum_type]        <= 4 &&
1499             new.uncompressed_size               <= CRC32_SIZE_MAX &&
1500             new.nonce                           <= CRC32_NONCE_MAX) {
1501                 crc->crc32 = (struct bch_extent_crc32) {
1502                         .type = 1 << BCH_EXTENT_ENTRY_crc32,
1503                         common_fields(new),
1504                         .csum                   = *((__le32 *) &new.csum.lo),
1505                 };
1506                 return;
1507         }
1508
1509         if (bch_crc_bytes[new.csum_type]        <= 10 &&
1510             new.uncompressed_size               <= CRC64_SIZE_MAX &&
1511             new.nonce                           <= CRC64_NONCE_MAX) {
1512                 crc->crc64 = (struct bch_extent_crc64) {
1513                         .type = 1 << BCH_EXTENT_ENTRY_crc64,
1514                         common_fields(new),
1515                         .nonce                  = new.nonce,
1516                         .csum_lo                = new.csum.lo,
1517                         .csum_hi                = *((__le16 *) &new.csum.hi),
1518                 };
1519                 return;
1520         }
1521
1522         if (bch_crc_bytes[new.csum_type]        <= 16 &&
1523             new.uncompressed_size               <= CRC128_SIZE_MAX &&
1524             new.nonce                           <= CRC128_NONCE_MAX) {
1525                 crc->crc128 = (struct bch_extent_crc128) {
1526                         .type = 1 << BCH_EXTENT_ENTRY_crc128,
1527                         common_fields(new),
1528                         .nonce                  = new.nonce,
1529                         .csum                   = new.csum,
1530                 };
1531                 return;
1532         }
1533 #undef common_fields
1534         BUG();
1535 }
1536
1537 void bch2_extent_crc_append(struct bkey_i_extent *e,
1538                             struct bch_extent_crc_unpacked new)
1539 {
1540         bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
1541         __extent_entry_push(e);
1542 }
1543
1544 static inline void __extent_entry_insert(struct bkey_i_extent *e,
1545                                          union bch_extent_entry *dst,
1546                                          union bch_extent_entry *new)
1547 {
1548         union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e));
1549
1550         memmove_u64s_up((u64 *) dst + extent_entry_u64s(new),
1551                         dst, (u64 *) end - (u64 *) dst);
1552         e->k.u64s += extent_entry_u64s(new);
1553         memcpy_u64s_small(dst, new, extent_entry_u64s(new));
1554 }
1555
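/*
 * Append a decoded pointer to @e: reuse an existing crc entry whose fields
 * match p->crc if there is one, otherwise append a new crc entry, then
 * insert the pointer (and any stripe pointers) at that position.
 */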
1556 void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
1557                                     struct extent_ptr_decoded *p)
1558 {
1559         struct bch_extent_crc_unpacked crc = bch2_extent_crc_unpack(&e->k, NULL);
1560         union bch_extent_entry *pos;
1561         unsigned i;
1562
1563         if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
1564                 pos = e->v.start;
1565                 goto found;
1566         }
1567
1568         extent_for_each_crc(extent_i_to_s(e), crc, pos)
1569                 if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
1570                         pos = extent_entry_next(pos);
1571                         goto found;
1572                 }
1573
1574         bch2_extent_crc_append(e, p->crc);
1575         pos = extent_entry_last(extent_i_to_s(e));
1576 found:
1577         p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
1578         __extent_entry_insert(e, pos, to_entry(&p->ptr));
1579
1580         for (i = 0; i < p->ec_nr; i++) {
1581                 p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
1582                 __extent_entry_insert(e, pos, to_entry(&p->ec[i]));
1583         }
1584 }
1585
1586 /*
1587  * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
1588  *
1589  * Returns true if @k should be dropped entirely
1590  *
1591  * For existing keys, only called when btree nodes are being rewritten, not when
1592  * they're merely being compacted/resorted in memory.
1593  */
1594 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
1595 {
1596         struct bkey_s_extent e;
1597
1598         switch (k.k->type) {
1599         case KEY_TYPE_ERROR:
1600                 return false;
1601
1602         case KEY_TYPE_DELETED:
1603                 return true;
1604         case KEY_TYPE_DISCARD:
1605                 return bversion_zero(k.k->version);
1606         case KEY_TYPE_COOKIE:
1607                 return false;
1608
1609         case BCH_EXTENT:
1610         case BCH_EXTENT_CACHED:
1611                 e = bkey_s_to_extent(k);
1612
1613                 bch2_extent_drop_stale(c, e);
1614
1615                 if (!bkey_val_u64s(e.k)) {
1616                         if (bkey_extent_is_cached(e.k)) {
1617                                 k.k->type = KEY_TYPE_DISCARD;
1618                                 if (bversion_zero(k.k->version))
1619                                         return true;
1620                         } else {
1621                                 k.k->type = KEY_TYPE_ERROR;
1622                         }
1623                 }
1624
1625                 return false;
1626         case BCH_RESERVATION:
1627                 return false;
1628         default:
1629                 BUG();
1630         }
1631 }
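
/*
 * Summary of the transitions above: a cached extent whose pointers were all
 * stale becomes a discard (and is dropped entirely if its version is zero),
 * while a dirty extent that lost all its pointers becomes an error key,
 * which bch2_extent_pick_ptr() below turns into -EIO on reads.
 */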
1632
1633 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
1634                                       struct bkey_s_extent e,
1635                                       unsigned target,
1636                                       unsigned nr_desired_replicas)
1637 {
1638         union bch_extent_entry *entry;
1639         struct extent_ptr_decoded p;
1640         int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
1641
1642         if (target && extra > 0)
1643                 extent_for_each_ptr_decode(e, p, entry) {
1644                         int n = bch2_extent_ptr_durability(c, p);
1645
1646                         if (n && n <= extra &&
1647                             !bch2_dev_in_target(c, p.ptr.dev, target)) {
1648                                 entry->ptr.cached = true;
1649                                 extra -= n;
1650                         }
1651                 }
1652
1653         if (extra > 0)
1654                 extent_for_each_ptr_decode(e, p, entry) {
1655                         int n = bch2_extent_ptr_durability(c, p);
1656
1657                         if (n && n <= extra) {
1658                                 entry->ptr.cached = true;
1659                                 extra -= n;
1660                         }
1661                 }
1662 }
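
/*
 * Worked example (illustrative): an extent with three dirty pointers of
 * durability 1 each and nr_desired_replicas == 2 has extra == 1, so one
 * pointer gets marked cached, preferring (in the first pass) a pointer
 * outside @target.
 */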
1663
1664 /*
1665  * Pick a non-stale pointer to read from, preferring pointers on devices that
1666  * haven't already failed for this extent (tracked by @failed). Fills in *pick
1667  * when a pointer is chosen; error keys and unreadable dirty extents yield -EIO.
1668  */
1669 int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
1670                          struct bch_io_failures *failed,
1671                          struct extent_ptr_decoded *pick)
1672 {
1673         int ret;
1674
1675         switch (k.k->type) {
1676         case KEY_TYPE_ERROR:
1677                 return -EIO;
1678
1679         case BCH_EXTENT:
1680         case BCH_EXTENT_CACHED:
1681                 ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
1682                                               failed, pick);
1683
1684                 if (!ret && !bkey_extent_is_cached(k.k))
1685                         ret = -EIO;
1686
1687                 return ret;
1688
1689         default:
1690                 return 0;
1691         }
1692 }
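
/*
 * Illustrative sketch, not part of the original file: how a caller might use
 * bch2_extent_pick_ptr(). The zero-initialized bch_io_failures and treating a
 * non-positive return as "error or nothing to read" are assumptions for the
 * example, inferred from the code above.
 */
static int __maybe_unused example_read_pick(struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_io_failures failed;
	struct extent_ptr_decoded pick;
	int ret;

	memset(&failed, 0, sizeof(failed));

	ret = bch2_extent_pick_ptr(c, k, &failed, &pick);
	if (ret <= 0)
		return ret;	/* error, or nothing to read (e.g. a hole) */

	/* a real caller would now issue the read against pick.ptr.dev */
	return 0;
}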
1693
1694 enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
1695                                     struct bkey_i *l, struct bkey_i *r)
1696 {
1697         struct bkey_s_extent el, er;
1698         union bch_extent_entry *en_l, *en_r;
1699
1700         if (key_merging_disabled(c))
1701                 return BCH_MERGE_NOMERGE;
1702
1703         /*
1704          * Generic header checks
1705          * Assumes left and right are in order
1706          * Left and right must be exactly aligned
1707          */
1708
1709         if (l->k.u64s           != r->k.u64s ||
1710             l->k.type           != r->k.type ||
1711             bversion_cmp(l->k.version, r->k.version) ||
1712             bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
1713                 return BCH_MERGE_NOMERGE;
1714
1715         switch (l->k.type) {
1716         case KEY_TYPE_DISCARD:
1717         case KEY_TYPE_ERROR:
1718                 /* These types are mergeable, and have no val to check */
1719                 break;
1720
1721         case BCH_EXTENT:
1722         case BCH_EXTENT_CACHED:
1723                 el = bkey_i_to_s_extent(l);
1724                 er = bkey_i_to_s_extent(r);
1725
1726                 extent_for_each_entry(el, en_l) {
1727                         struct bch_extent_ptr *lp, *rp;
1728                         struct bch_dev *ca;
1729
1730                         en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
1731
1732                         if ((extent_entry_type(en_l) !=
1733                              extent_entry_type(en_r)) ||
1734                             !extent_entry_is_ptr(en_l))
1735                                 return BCH_MERGE_NOMERGE;
1736
1737                         lp = &en_l->ptr;
1738                         rp = &en_r->ptr;
1739
1740                         if (lp->offset + el.k->size     != rp->offset ||
1741                             lp->dev                     != rp->dev ||
1742                             lp->gen                     != rp->gen)
1743                                 return BCH_MERGE_NOMERGE;
1744
1745                         /* We don't allow extents to straddle buckets: */
1746                         ca = bch_dev_bkey_exists(c, lp->dev);
1747
1748                         if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
1749                                 return BCH_MERGE_NOMERGE;
1750                 }
1751
1752                 break;
1753         case BCH_RESERVATION: {
1754                 struct bkey_i_reservation *li = bkey_i_to_reservation(l);
1755                 struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
1756
1757                 if (li->v.generation != ri->v.generation ||
1758                     li->v.nr_replicas != ri->v.nr_replicas)
1759                         return BCH_MERGE_NOMERGE;
1760                 break;
1761         }
1762         default:
1763                 return BCH_MERGE_NOMERGE;
1764         }
1765
1766         l->k.needs_whiteout |= r->k.needs_whiteout;
1767
1768         /* Keys with no pointers aren't restricted to one bucket, so the
1769          * combined size could overflow KEY_SIZE_MAX:
1770          */
1771         if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
1772                 bch2_key_resize(&l->k, KEY_SIZE_MAX);
1773                 bch2_cut_front(l->k.p, r);
1774                 return BCH_MERGE_PARTIAL;
1775         }
1776
1777         bch2_key_resize(&l->k, l->k.size + r->k.size);
1778
1779         return BCH_MERGE_MERGE;
1780 }
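
/*
 * Example of the partial merge case above (illustrative): merging two
 * reservations of size KEY_SIZE_MAX - 10 and 100 grows the left key to
 * KEY_SIZE_MAX, cuts the first 10 sectors off the right key, and returns
 * BCH_MERGE_PARTIAL so the caller keeps both keys.
 */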
1781
1782 /*
1783  * When merging an extent that we're inserting into a btree node, the new merged
1784  * extent could overlap with an existing 0 size (deleted) extent - that would
1785  * break the btree node iterator's sort order, so we refuse the merge if it
1786  * would create such an overlap.
1787  *
1788  * Also unpacks and repacks.
1789  */
1790 static bool bch2_extent_merge_inline(struct bch_fs *c,
1791                                      struct btree_iter *iter,
1792                                      struct bkey_packed *l,
1793                                      struct bkey_packed *r,
1794                                      bool back_merge)
1795 {
1796         struct btree *b = iter->l[0].b;
1797         struct btree_node_iter *node_iter = &iter->l[0].iter;
1798         BKEY_PADDED(k) li, ri;
1799         struct bkey_packed *m   = back_merge ? l : r;
1800         struct bkey_i *mi       = back_merge ? &li.k : &ri.k;
1801         struct bset_tree *t     = bch2_bkey_to_bset(b, m);
1802         enum merge_result ret;
1803
1804         EBUG_ON(bkey_written(b, m));
1805
1806         /*
1807          * We need to save copies of both l and r, because we might get a
1808          * partial merge (which modifies both) and then fail to repack
1809          */
1810         bch2_bkey_unpack(b, &li.k, l);
1811         bch2_bkey_unpack(b, &ri.k, r);
1812
1813         ret = bch2_extent_merge(c, b, &li.k, &ri.k);
1814         if (ret == BCH_MERGE_NOMERGE)
1815                 return false;
1816
1817         /*
1818          * check if we overlap with deleted extents - would break the sort
1819          * order:
1820          */
1821         if (back_merge) {
1822                 struct bkey_packed *n = bkey_next(m);
1823
1824                 if (n != btree_bkey_last(b, t) &&
1825                     bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 &&
1826                     bkey_deleted(n))
1827                         return false;
1828         } else if (ret == BCH_MERGE_MERGE) {
1829                 struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
1830
1831                 if (prev &&
1832                     bkey_cmp_left_packed_byval(b, prev,
1833                                 bkey_start_pos(&li.k.k)) > 0)
1834                         return false;
1835         }
1836
1837         if (ret == BCH_MERGE_PARTIAL) {
1838                 if (!extent_i_save(b, m, mi))
1839                         return false;
1840
1841                 if (!back_merge)
1842                         bkey_copy(packed_to_bkey(l), &li.k);
1843                 else
1844                         bkey_copy(packed_to_bkey(r), &ri.k);
1845         } else {
1846                 if (!extent_i_save(b, m, &li.k))
1847                         return false;
1848         }
1849
1850         bch2_bset_fix_invalidated_key(b, m);
1851         bch2_btree_node_iter_fix(iter, b, node_iter,
1852                                  m, m->u64s, m->u64s);
1853         verify_modified_extent(iter, m);
1854
1855         return ret == BCH_MERGE_MERGE;
1856 }
1857
1858 int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
1859 {
1860         struct btree_iter iter;
1861         struct bpos end = pos;
1862         struct bkey_s_c k;
1863         int ret = 0;
1864
1865         end.offset += size;
1866
1867         for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
1868                              BTREE_ITER_SLOTS, k) {
1869                 if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
1870                         break;
1871
1872                 if (!bch2_extent_is_fully_allocated(k)) {
1873                         ret = -ENOSPC;
1874                         break;
1875                 }
1876         }
1877         bch2_btree_iter_unlock(&iter);
1878
1879         return ret;
1880 }
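
/*
 * Illustrative sketch, not part of the original file: checking that a range
 * of a (hypothetical) inode @inum is fully allocated before doing an
 * overwrite that must not allocate. Sector units are assumed.
 */
static int __maybe_unused example_range_allocated(struct bch_fs *c, u64 inum,
						  u64 start_sector,
						  u64 nr_sectors)
{
	return bch2_check_range_allocated(c, POS(inum, start_sector),
					  nr_sectors);
}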