b4a04df5ea9555445f86df82b203ddfe01e85cc9
[linux-block.git] / fs / bcachefs / bcachefs_format.h
1 /* SPDX-License-Identifier: GPL-2.0 */
2 #ifndef _BCACHEFS_FORMAT_H
3 #define _BCACHEFS_FORMAT_H
4
5 /*
6  * bcachefs on disk data structures
7  *
8  * OVERVIEW:
9  *
10  * There are three main types of on disk data structures in bcachefs (this is
11  * reduced from 5 in bcache)
12  *
13  *  - superblock
14  *  - journal
15  *  - btree
16  *
17  * The btree is the primary structure; most metadata exists as keys in the
18  * various btrees. There are only a small number of btrees, they're not
19  * sharded - we have one btree for extents, another for inodes, et cetera.
20  *
21  * SUPERBLOCK:
22  *
 * The superblock contains the location of the journal, the list of devices in
 * the filesystem, and in general any metadata we need in order to decide
 * whether we can start a filesystem, or that we need prior to reading the
 * journal/btree roots.
27  *
28  * The superblock is extensible, and most of the contents of the superblock are
29  * in variable length, type tagged fields; see struct bch_sb_field.
30  *
31  * Backup superblocks do not reside in a fixed location; also, superblocks do
32  * not have a fixed size. To locate backup superblocks we have struct
33  * bch_sb_layout; we store a copy of this inside every superblock, and also
34  * before the first superblock.
35  *
36  * JOURNAL:
37  *
38  * The journal primarily records btree updates in the order they occurred;
39  * journal replay consists of just iterating over all the keys in the open
40  * journal entries and re-inserting them into the btrees.
41  *
42  * The journal also contains entry types for the btree roots, and blacklisted
43  * journal sequence numbers (see journal_seq_blacklist.c).
44  *
45  * BTREE:
46  *
47  * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
48  * 128k-256k) and log structured. We use struct btree_node for writing the first
49  * entry in a given node (offset 0), and struct btree_node_entry for all
50  * subsequent writes.
51  *
52  * After the header, btree node entries contain a list of keys in sorted order.
53  * Values are stored inline with the keys; since values are variable length (and
54  * keys effectively are variable length too, due to packing) we can't do random
55  * access without building up additional in memory tables in the btree node read
56  * path.
57  *
58  * BTREE KEYS (struct bkey):
59  *
60  * The various btrees share a common format for the key - so as to avoid
61  * switching in fastpath lookup/comparison code - but define their own
62  * structures for the key values.
63  *
64  * The size of a key/value pair is stored as a u8 in units of u64s, so the max
65  * size is just under 2k. The common part also contains a type tag for the
66  * value, and a format field indicating whether the key is packed or not (and
67  * also meant to allow adding new key fields in the future, if desired).
68  *
69  * bkeys, when stored within a btree node, may also be packed. In that case, the
70  * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
71  * be generous with field sizes in the common part of the key format (64 bit
72  * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
73  */
74
75 #include <asm/types.h>
76 #include <asm/byteorder.h>
77 #include <linux/kernel.h>
78 #include <linux/uuid.h>
79 #include <uapi/linux/magic.h>
80 #include "vstructs.h"
81
/*
 * Kernel builds alias __uuid_t to the in-kernel uuid_t; the on-disk structures
 * below (e.g. struct bch_sb_layout) are written in terms of __uuid_t.
 * NOTE(review): non-kernel builds presumably get __uuid_t from elsewhere (not
 * visible in this header) - confirm.
 */
#ifdef __KERNEL__
typedef uuid_t __uuid_t;
#endif
85
/*
 * BITMASK() - generate accessors for a bitfield stored in a native-endian
 * integer member.
 *
 * @name:   base name; generates name(), SET_name(), name_OFFSET and name_BITS
 * @type:   struct type containing the member
 * @field:  member of @type that holds the bits
 * @offset: index of the lowest bit of the bitfield
 * @end:    index one past the highest bit of the bitfield
 *
 * name() returns bits [offset, end) of k->field, shifted down to bit 0.
 * SET_name() replaces those bits with the low (end - offset) bits of @v and
 * leaves the other bits of the member untouched.
 *
 * @offset and @end are parenthesized at every use so that expression
 * arguments (e.g. "BASE + 4") are evaluated as a unit.
 *
 * The bitfield must be narrower than 64 bits: the mask is built by shifting
 * ~0ULL left by the field width, and a shift by 64 is undefined.
 */
#define BITMASK(name, type, field, offset, end)                         \
static const __maybe_unused unsigned    name##_OFFSET = (offset);       \
static const __maybe_unused unsigned    name##_BITS = ((end) - (offset));\
                                                                        \
static inline __u64 name(const type *k)                                 \
{                                                                       \
        return (k->field >> (offset)) &                                 \
                ~(~0ULL << ((end) - (offset)));                         \
}                                                                       \
                                                                        \
static inline void SET_##name(type *k, __u64 v)                         \
{                                                                       \
        k->field &= ~(~(~0ULL << ((end) - (offset))) << (offset));      \
        k->field |= (v & ~(~0ULL << ((end) - (offset)))) << (offset);   \
}
100
/*
 * LE_BITMASK() - generate accessors for a bitfield stored in a little-endian
 * integer member that is @_bits wide.
 *
 * @_bits:  width of the member (16, 32 or 64); selects the __le<N>_to_cpu() /
 *          __cpu_to_le<N>() conversions and the type of name_MAX
 * @name:   base name; generates name(), SET_name(), name_OFFSET, name_BITS
 *          and name_MAX (largest value the bitfield can hold)
 * @type:   struct type containing the member
 * @field:  member of @type that holds the bits
 * @offset: index of the lowest bit of the bitfield
 * @end:    index one past the highest bit of the bitfield
 *
 * name() converts the member to CPU byte order and returns bits [offset, end)
 * shifted down to bit 0.  SET_name() does a read-modify-write in CPU byte
 * order, replacing only those bits with the low (end - offset) bits of @v, and
 * stores the result back in little-endian byte order.
 *
 * @offset and @end are parenthesized at every use so that expression
 * arguments are evaluated as a unit.
 *
 * The bitfield must be narrower than 64 bits: the masks are built with 64-bit
 * shifts by the field width, and a shift by 64 is undefined.
 */
#define LE_BITMASK(_bits, name, type, field, offset, end)               \
static const __maybe_unused unsigned    name##_OFFSET = (offset);       \
static const __maybe_unused unsigned    name##_BITS = ((end) - (offset));\
static const __maybe_unused __u##_bits  name##_MAX =                    \
        (1ULL << ((end) - (offset))) - 1;                               \
                                                                        \
static inline __u64 name(const type *k)                                 \
{                                                                       \
        return (__le##_bits##_to_cpu(k->field) >> (offset)) &           \
                ~(~0ULL << ((end) - (offset)));                         \
}                                                                       \
                                                                        \
static inline void SET_##name(type *k, __u64 v)                         \
{                                                                       \
        __u##_bits new = __le##_bits##_to_cpu(k->field);                \
                                                                        \
        new &= ~(~(~0ULL << ((end) - (offset))) << (offset));           \
        new |= (v & ~(~0ULL << ((end) - (offset)))) << (offset);        \
        k->field = __cpu_to_le##_bits(new);                             \
}

/* Width-specific shorthands: (name, type, field, offset, end) */
#define LE16_BITMASK(n, t, f, o, e)     LE_BITMASK(16, n, t, f, o, e)
#define LE32_BITMASK(n, t, f, o, e)     LE_BITMASK(32, n, t, f, o, e)
#define LE64_BITMASK(n, t, f, o, e)     LE_BITMASK(64, n, t, f, o, e)
124
/*
 * struct bkey_format - per-field description of a key format.
 *
 * BKEY_FORMAT_CURRENT fills this in for an unpacked struct bkey:
 * key_u64s = BKEY_U64s, nr_fields = BKEY_NR_FIELDS, and bits_per_field[] set
 * to the width in bits of each struct bkey field, indexed by
 * enum bch_bkey_fields.
 */
struct bkey_format {
        /* NOTE(review): presumably the size of a key in this format, in u64s */
        __u8            key_u64s;
        __u8            nr_fields;
        /* One unused slot for now: */
        /*
         * NOTE(review): enum bch_bkey_fields currently has 6 entries
         * (BKEY_NR_FIELDS == 6), so the "one unused slot" remark above looks
         * stale - confirm.
         */
        __u8            bits_per_field[6];
        /*
         * NOTE(review): semantics not visible here; presumably a per-field
         * base value used when packing - confirm.
         */
        __le64          field_offset[6];
};
132
/* Btree keys - all units are in sectors */

/*
 * struct bpos - position of a key: (inode, offset, snapshot).
 *
 * The struct is packed (64 + 64 + 32 bits, no padding).  The member order is
 * reversed between little-endian and big-endian builds; on little-endian
 * builds it is additionally declared 4-byte aligned.
 */
struct bpos {
        /*
         * Word order matches machine byte order - btree code treats a bpos as a
         * single large integer, for search/comparison purposes
         *
         * Note that wherever a bpos is embedded in another on disk data
         * structure, it has to be byte swabbed when reading in metadata that
         * wasn't written in native endian order:
         */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
        /* Least significant member first */
        __u32           snapshot;
        __u64           offset;
        __u64           inode;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
        /* Most significant member first */
        __u64           inode;
        __u64           offset;         /* Points to end of extent - sectors */
        __u32           snapshot;
#else
#error edit for your odd byteorder.
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;
160
/* Largest values representable in the bpos fields and in the key size field */
#define KEY_INODE_MAX                   ((__u64) 0xffffffffffffffffULL)
#define KEY_OFFSET_MAX                  ((__u64) 0xffffffffffffffffULL)
#define KEY_SNAPSHOT_MAX                ((__u32) 0xffffffffU)
#define KEY_SIZE_MAX                    ((__u32) 0xffffffffU)
165
166 static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
167 {
168         return (struct bpos) {
169                 .inode          = inode,
170                 .offset         = offset,
171                 .snapshot       = snapshot,
172         };
173 }
174
/*
 * Convenience positions, all built with SPOS():
 *
 *  POS_MIN  - all fields zero
 *  POS_MAX  - maximum inode and offset, snapshot 0
 *  SPOS_MAX - maximum inode, offset and snapshot
 *  POS()    - (_inode, _offset) with snapshot 0
 */
#define POS_MIN                         SPOS(0, 0, 0)
#define POS_MAX                         SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
#define SPOS_MAX                        SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
#define POS(_inode, _offset)            SPOS(_inode, _offset, 0)
179
/*
 * Empty placeholder struct, for container_of()
 *
 * The only member is a zero-length __u64 array.  The value structs in this
 * file (struct bch_cookie, struct bch_backpointer, ...) embed it as their
 * first member.
 */
struct bch_val {
        __u64           __nothing[0];
};
184
185 struct bversion {
186 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
187         __u64           lo;
188         __u32           hi;
189 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
190         __u32           hi;
191         __u64           lo;
192 #endif
193 } __packed
194 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
195 __aligned(4)
196 #endif
197 ;
198
199 struct bkey {
200         /* Size of combined key and value, in u64s */
201         __u8            u64s;
202
203         /* Format of key (0 for format local to btree node) */
204 #if defined(__LITTLE_ENDIAN_BITFIELD)
205         __u8            format:7,
206                         needs_whiteout:1;
207 #elif defined (__BIG_ENDIAN_BITFIELD)
208         __u8            needs_whiteout:1,
209                         format:7;
210 #else
211 #error edit for your odd byteorder.
212 #endif
213
214         /* Type of the value */
215         __u8            type;
216
217 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
218         __u8            pad[1];
219
220         struct bversion bversion;
221         __u32           size;           /* extent size, in sectors */
222         struct bpos     p;
223 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
224         struct bpos     p;
225         __u32           size;           /* extent size, in sectors */
226         struct bversion bversion;
227
228         __u8            pad[1];
229 #endif
230 } __packed
231 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
232 /*
233  * The big-endian version of bkey can't be compiled by rustc with the "aligned"
234  * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
235  * So for Rust compatibility, don't include this. It can be included in the LE
236  * version because the "packed" attr is redundant in that case.
237  *
238  * History: (quoting Kent)
239  *
240  * Specifically, when i was designing bkey, I wanted the header to be no
241  * bigger than necessary so that bkey_packed could use the rest. That means that
242  * decently offten extent keys will fit into only 8 bytes, instead of spilling over
243  * to 16.
244  *
245  * But packed_bkey treats the part after the header - the packed section -
246  * as a single multi word, variable length integer. And bkey, the unpacked
247  * version, is just a special case version of a bkey_packed; all the packed
248  * bkey code will work on keys in any packed format, the in-memory
249  * representation of an unpacked key also is just one type of packed key...
250  *
251  * So that constrains the key part of a bkig endian bkey to start right
252  * after the header.
253  *
254  * If we ever do a bkey_v2 and need to expand the hedaer by another byte for
255  * some reason - that will clean up this wart.
256  */
257 __aligned(8)
258 #endif
259 ;
260
261 struct bkey_packed {
262         __u64           _data[0];
263
264         /* Size of combined key and value, in u64s */
265         __u8            u64s;
266
267         /* Format of key (0 for format local to btree node) */
268
269         /*
270          * XXX: next incompat on disk format change, switch format and
271          * needs_whiteout - bkey_packed() will be cheaper if format is the high
272          * bits of the bitfield
273          */
274 #if defined(__LITTLE_ENDIAN_BITFIELD)
275         __u8            format:7,
276                         needs_whiteout:1;
277 #elif defined (__BIG_ENDIAN_BITFIELD)
278         __u8            needs_whiteout:1,
279                         format:7;
280 #endif
281
282         /* Type of the value */
283         __u8            type;
284         __u8            key_start[0];
285
286         /*
287          * We copy bkeys with struct assignment in various places, and while
288          * that shouldn't be done with packed bkeys we can't disallow it in C,
289          * and it's legal to cast a bkey to a bkey_packed  - so padding it out
290          * to the same size as struct bkey should hopefully be safest.
291          */
292         __u8            pad[sizeof(struct bkey) - 3];
293 } __packed __aligned(8);
294
/* 128 bit little-endian value, stored as two 64 bit halves (low half first) */
typedef struct {
        __le64                  lo;
        __le64                  hi;
} bch_le128;
299
/* Size of an unpacked struct bkey, in u64s */
#define BKEY_U64s                       (sizeof(struct bkey) / sizeof(__u64))
/* bkey.u64s is a __u8, so this is the largest key + value size, in u64s */
#define BKEY_U64s_MAX                   U8_MAX
/* Largest value size, in u64s, that still fits next to an unpacked key */
#define BKEY_VAL_U64s_MAX               (BKEY_U64s_MAX - BKEY_U64s)

/*
 * NOTE(review): presumably the bit offset of the packed section, i.e. the
 * three header bytes (u64s, format/needs_whiteout, type) * 8 - confirm.
 */
#define KEY_PACKED_BITS_START           24

/* Values for the format field of struct bkey / struct bkey_packed */
#define KEY_FORMAT_LOCAL_BTREE          0
#define KEY_FORMAT_CURRENT              1
308
/*
 * Indices of the individual key fields, as used to index the per-field
 * arrays of struct bkey_format (see bkey_format_field()).  BKEY_NR_FIELDS
 * is the number of fields.
 */
enum bch_bkey_fields {
        BKEY_FIELD_INODE        = 0,
        BKEY_FIELD_OFFSET       = 1,
        BKEY_FIELD_SNAPSHOT     = 2,
        BKEY_FIELD_SIZE         = 3,
        BKEY_FIELD_VERSION_HI   = 4,
        BKEY_FIELD_VERSION_LO   = 5,
        BKEY_NR_FIELDS,
};
318
/*
 * Designated array initializer: entry BKEY_FIELD_<name> is set to the width,
 * in bits, of the given struct bkey member.  The NULL pointer is only used
 * inside sizeof() and is never dereferenced.
 */
#define bkey_format_field(name, field)                                  \
        [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)

/*
 * The struct bkey_format describing an unpacked struct bkey: every field has
 * its full width.  field_offset[] is not named in the initializer and is
 * therefore zero.
 */
#define BKEY_FORMAT_CURRENT                                             \
((struct bkey_format) {                                                 \
        .key_u64s       = BKEY_U64s,                                    \
        .nr_fields      = BKEY_NR_FIELDS,                               \
        .bits_per_field = {                                             \
                bkey_format_field(INODE,        p.inode),               \
                bkey_format_field(OFFSET,       p.offset),              \
                bkey_format_field(SNAPSHOT,     p.snapshot),            \
                bkey_format_field(SIZE,         size),                  \
                bkey_format_field(VERSION_HI,   bversion.hi),           \
                bkey_format_field(VERSION_LO,   bversion.lo),           \
        },                                                              \
})
335
/*
 * bkey with inline value
 *
 * k is the unpacked key; v is the empty struct bch_val placeholder that marks
 * where the value starts.  _data is a zero-length member giving __u64 access
 * to the start of the key.
 */
struct bkey_i {
        __u64                   _data[0];

        struct bkey     k;
        struct bch_val  v;
};
343
/*
 * POS_KEY() - struct bkey compound literal at position _pos.
 *
 * u64s is BKEY_U64s (key only, no value) and format is KEY_FORMAT_CURRENT;
 * every member not named here (type, size, bversion, ...) is zero.
 */
#define POS_KEY(_pos)                                                   \
((struct bkey) {                                                        \
        .u64s           = BKEY_U64s,                                    \
        .format         = KEY_FORMAT_CURRENT,                           \
        .p              = _pos,                                         \
})

/*
 * KEY() - struct bkey compound literal at POS(_inode, _offset) with the given
 * size.  As with POS_KEY(), u64s is BKEY_U64s, format is KEY_FORMAT_CURRENT
 * and all other members are zero.
 */
#define KEY(_inode, _offset, _size)                                     \
((struct bkey) {                                                        \
        .u64s           = BKEY_U64s,                                    \
        .format         = KEY_FORMAT_CURRENT,                           \
        .p              = POS(_inode, _offset),                         \
        .size           = _size,                                        \
})
358
359 static inline void bkey_init(struct bkey *k)
360 {
361         *k = KEY(0, 0, 0);
362 }
363
/* Size of a key and its value in bytes; _k->u64s counts 8-byte units */
#define bkey_bytes(_k)          ((_k)->u64s * sizeof(__u64))

/*
 * Declares a struct bkey_i named @key immediately followed by an array of
 * @pad __u64s named <key>_pad.
 * NOTE(review): presumably the array reserves room for the key's value -
 * confirm against users.
 */
#define __BKEY_PADDED(key, pad)                                 \
        struct bkey_i key; __u64 key ## _pad[pad]
368
/*
 * Per key type flags; these appear as the third column of BCH_BKEY_TYPES().
 * NOTE(review): the exact checks selected by strict_btree_checks are not
 * visible in this header.
 */
enum bch_bkey_type_flags {
        BKEY_TYPE_strict_btree_checks   = BIT(0),
};
372
/*
 * Table of key (value) types, as an x-macro.  Each entry is
 *
 *   x(name, nr, flags)
 *
 * where nr is the numeric value of KEY_TYPE_<name> and flags is 0 or a
 * combination of enum bch_bkey_type_flags.
 *
 * - DELETED keys are used internally to mark keys that should be ignored but
 *   override keys in composition order.  Their version number is ignored.
 *
 * - DISCARDED keys indicate that the data is all 0s because it has been
 *   discarded. DISCARDs may have a version; if the version is nonzero the key
 *   will be persistent, otherwise the key will be dropped whenever the btree
 *   node is rewritten (like DELETED keys).
 *
 * - ERROR: any read of the data returns a read error, as the data was lost due
 *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
 *   by new writes or cluster-wide GC. Node repair can also overwrite them with
 *   the same or a more recent version number, but not with an older version
 *   number.
 *
 * - WHITEOUT: for hash table btrees
 *
 * NOTE(review): there is no "discarded" entry in the table below; the
 * DISCARDED paragraph may be stale - confirm.
 */
#define BCH_BKEY_TYPES()                                                \
        x(deleted,              0,      0)                              \
        x(whiteout,             1,      0)                              \
        x(error,                2,      0)                              \
        x(cookie,               3,      0)                              \
        x(hash_whiteout,        4,      BKEY_TYPE_strict_btree_checks)  \
        x(btree_ptr,            5,      BKEY_TYPE_strict_btree_checks)  \
        x(extent,               6,      BKEY_TYPE_strict_btree_checks)  \
        x(reservation,          7,      BKEY_TYPE_strict_btree_checks)  \
        x(inode,                8,      BKEY_TYPE_strict_btree_checks)  \
        x(inode_generation,     9,      BKEY_TYPE_strict_btree_checks)  \
        x(dirent,               10,     BKEY_TYPE_strict_btree_checks)  \
        x(xattr,                11,     BKEY_TYPE_strict_btree_checks)  \
        x(alloc,                12,     BKEY_TYPE_strict_btree_checks)  \
        x(quota,                13,     BKEY_TYPE_strict_btree_checks)  \
        x(stripe,               14,     BKEY_TYPE_strict_btree_checks)  \
        x(reflink_p,            15,     BKEY_TYPE_strict_btree_checks)  \
        x(reflink_v,            16,     BKEY_TYPE_strict_btree_checks)  \
        x(inline_data,          17,     BKEY_TYPE_strict_btree_checks)  \
        x(btree_ptr_v2,         18,     BKEY_TYPE_strict_btree_checks)  \
        x(indirect_inline_data, 19,     BKEY_TYPE_strict_btree_checks)  \
        x(alloc_v2,             20,     BKEY_TYPE_strict_btree_checks)  \
        x(subvolume,            21,     BKEY_TYPE_strict_btree_checks)  \
        x(snapshot,             22,     BKEY_TYPE_strict_btree_checks)  \
        x(inode_v2,             23,     BKEY_TYPE_strict_btree_checks)  \
        x(alloc_v3,             24,     BKEY_TYPE_strict_btree_checks)  \
        x(set,                  25,     0)                              \
        x(lru,                  26,     BKEY_TYPE_strict_btree_checks)  \
        x(alloc_v4,             27,     BKEY_TYPE_strict_btree_checks)  \
        x(backpointer,          28,     BKEY_TYPE_strict_btree_checks)  \
        x(inode_v3,             29,     BKEY_TYPE_strict_btree_checks)  \
        x(bucket_gens,          30,     BKEY_TYPE_strict_btree_checks)  \
        x(snapshot_tree,        31,     BKEY_TYPE_strict_btree_checks)  \
        x(logged_op_truncate,   32,     BKEY_TYPE_strict_btree_checks)  \
        x(logged_op_finsert,    33,     BKEY_TYPE_strict_btree_checks)  \
        x(accounting,           34,     BKEY_TYPE_strict_btree_checks)  \
        x(inode_alloc_cursor,   35,     BKEY_TYPE_strict_btree_checks)
427
428 enum bch_bkey_type {
429 #define x(name, nr, ...) KEY_TYPE_##name        = nr,
430         BCH_BKEY_TYPES()
431 #undef x
432         KEY_TYPE_MAX,
433 };
434
/*
 * Values of the simplest key types.  All but bch_cookie consist of nothing
 * but the empty struct bch_val placeholder, i.e. they carry no payload.
 */

struct bch_deleted {
        struct bch_val          v;
};

struct bch_whiteout {
        struct bch_val          v;
};

struct bch_error {
        struct bch_val          v;
};

/* Payload is a single little-endian 64 bit cookie */
struct bch_cookie {
        struct bch_val          v;
        __le64                  cookie;
};

struct bch_hash_whiteout {
        struct bch_val          v;
};

struct bch_set {
        struct bch_val          v;
};
459
/*
 * 128 bits, sufficient for cryptographic MACs:
 *
 * Stored as two little-endian 64 bit halves, low half first; the struct is
 * packed and 8-byte aligned.
 */
struct bch_csum {
        __le64                  lo;
        __le64                  hi;
} __packed __aligned(8);
465
/*
 * Value of a KEY_TYPE_backpointer key.
 *
 * NOTE(review): field meanings below are inferred from the names only;
 * presumably (btree_id, level, pos) identify the key that references the
 * bucket this backpointer is for - confirm against the backpointer code.
 */
struct bch_backpointer {
        struct bch_val          v;
        __u8                    btree_id;
        __u8                    level;
        __u8                    data_type;
        __u8                    bucket_gen;
        __u32                   pad;
        __u32                   bucket_len;
        struct bpos             pos;
} __packed __aligned(8);
476
/* Optional/variable size superblock sections: */

/*
 * Common header of every superblock section; the section structs below
 * (struct bch_sb_field_journal, struct bch_sb_field_crypt, ...) embed it as
 * their first member.
 */
struct bch_sb_field {
        /* Zero-length member giving __u64 access to the start of the field */
        __u64                   _data[0];
        /* NOTE(review): presumably the size of the whole field, in u64s */
        __le32                  u64s;
        /* Type tag; see enum bch_sb_field_type */
        __le32                  type;
};
484
/*
 * Table of superblock field types, as an x-macro.  Each entry is
 *
 *   x(name, nr)
 *
 * where nr is the numeric value of BCH_SB_FIELD_<name>.
 */
#define BCH_SB_FIELDS()                         \
        x(journal,                      0)      \
        x(members_v1,                   1)      \
        x(crypt,                        2)      \
        x(replicas_v0,                  3)      \
        x(quota,                        4)      \
        x(disk_groups,                  5)      \
        x(clean,                        6)      \
        x(replicas,                     7)      \
        x(journal_seq_blacklist,        8)      \
        x(journal_v2,                   9)      \
        x(counters,                     10)     \
        x(members_v2,                   11)     \
        x(errors,                       12)     \
        x(ext,                          13)     \
        x(downgrade,                    14)     \
        x(recovery_passes,              15)
502
503 #include "alloc_background_format.h"
504 #include "dirent_format.h"
505 #include "disk_accounting_format.h"
506 #include "disk_groups_format.h"
507 #include "extents_format.h"
508 #include "ec_format.h"
509 #include "inode_format.h"
510 #include "journal_seq_blacklist_format.h"
511 #include "logged_ops_format.h"
512 #include "lru_format.h"
513 #include "quota_format.h"
514 #include "recovery_passes_format.h"
515 #include "reflink_format.h"
516 #include "replicas_format.h"
517 #include "snapshot_format.h"
518 #include "subvolume_format.h"
519 #include "sb-counters_format.h"
520 #include "sb-downgrade_format.h"
521 #include "sb-errors_format.h"
522 #include "sb-members_format.h"
523 #include "xattr_format.h"
524
/*
 * One BCH_SB_FIELD_<name> enumerator per BCH_SB_FIELDS() entry, with the value
 * given in the table.  BCH_SB_FIELD_NR is one more than the last entry in the
 * table.
 */
enum bch_sb_field_type {
#define x(f, nr)        BCH_SB_FIELD_##f = nr,
        BCH_SB_FIELDS()
#undef x
        BCH_SB_FIELD_NR
};
531
/*
 * Most superblock fields are replicated in all device's superblocks - a few are
 * not:
 *
 * This is a bitmask with bit BCH_SB_FIELD_<name> set for each field type that
 * is not replicated (journal and journal_v2).
 */
#define BCH_SINGLE_DEVICE_SB_FIELDS             \
        ((1U << BCH_SB_FIELD_journal)|          \
         (1U << BCH_SB_FIELD_journal_v2))
539
/* BCH_SB_FIELD_journal: */

/*
 * Journal location, v1: a flexible array of 64 bit values following the
 * common field header.
 * NOTE(review): presumably one bucket number per journal bucket - confirm.
 */
struct bch_sb_field_journal {
        struct bch_sb_field     field;
        __le64                  buckets[];
};

/*
 * Journal location, v2: a flexible array of (start, nr) pairs following the
 * common field header.
 * NOTE(review): presumably each pair describes a contiguous range of journal
 * buckets - confirm.
 */
struct bch_sb_field_journal_v2 {
        struct bch_sb_field     field;

        struct bch_sb_field_journal_v2_entry {
                __le64          start;
                __le64          nr;
        }                       d[];
};
555
/* BCH_SB_FIELD_crypt: */

/* 128 bit nonce, stored as four little-endian 32 bit words */
struct nonce {
        __le32                  d[4];
};

/* 256 bit key, stored as four little-endian 64 bit words */
struct bch_key {
        __le64                  key[4];
};
565
/*
 * 64 bit magic number whose bytes, from least to most significant, are the
 * ASCII characters "bch**key".
 */
#define BCH_KEY_MAGIC                                   \
        (((__u64) 'b' <<  0) |                          \
         ((__u64) 'c' <<  8) |                          \
         ((__u64) 'h' << 16) |                          \
         ((__u64) '*' << 24) |                          \
         ((__u64) '*' << 32) |                          \
         ((__u64) 'k' << 40) |                          \
         ((__u64) 'e' << 48) |                          \
         ((__u64) 'y' << 56))
571
/*
 * A struct bch_key preceded by a 64 bit magic field.
 * NOTE(review): magic is presumably compared against BCH_KEY_MAGIC to tell
 * whether the key has been decrypted correctly - confirm.
 */
struct bch_encrypted_key {
        __le64                  magic;
        struct bch_key          key;
};

/*
 * If this field is present in the superblock, it stores an encryption key which
 * is used to encrypt all other data/metadata. The key will normally be
 * encrypted with the key userspace provides, but if encryption has been turned
 * off we'll just store the master key unencrypted in the superblock so we can
 * access the previously encrypted data.
 */
struct bch_sb_field_crypt {
        struct bch_sb_field     field;

        /* Holds BCH_CRYPT_KDF_TYPE in bits [0, 4) */
        __le64                  flags;
        /* Holds the BCH_KDF_SCRYPT_{N,R,P} parameters, 16 bits each */
        __le64                  kdf_flags;
        struct bch_encrypted_key key;
};
591
/*
 * Bits [0, 4) of bch_sb_field_crypt.flags.
 * NOTE(review): presumably holds an enum bch_kdf_types value - confirm.
 */
LE64_BITMASK(BCH_CRYPT_KDF_TYPE,        struct bch_sb_field_crypt, flags, 0, 4);

/* Key derivation functions; BCH_KDF_NR is the number of defined types */
enum bch_kdf_types {
        BCH_KDF_SCRYPT          = 0,
        BCH_KDF_NR              = 1,
};

/* stored as base 2 log of scrypt params: */
/* N, R and P occupy bits [0, 16), [16, 32) and [32, 48) of kdf_flags */
LE64_BITMASK(BCH_KDF_SCRYPT_N,  struct bch_sb_field_crypt, kdf_flags,  0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R,  struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P,  struct bch_sb_field_crypt, kdf_flags, 32, 48);
603
/*
 * On clean shutdown, store btree roots and current journal sequence number in
 * the superblock:
 *
 * struct jset_entry is the container for those entries; struct
 * bch_sb_field_clean below holds a sequence of them.
 *
 * The header is 8 bytes.  start and _data both refer to the payload that
 * follows it: start views it as struct bkey_i, _data as raw __u64s.
 */
struct jset_entry {
        /* NOTE(review): presumably the payload size in u64s - confirm */
        __le16                  u64s;
        __u8                    btree_id;
        __u8                    level;
        __u8                    type; /* designates what this jset holds */
        __u8                    pad[3];

        struct bkey_i           start[0];
        __u64                   _data[];
};
618
/*
 * Superblock field type "clean" (see BCH_SB_FIELDS()): the journal sequence
 * number followed by a variable number of struct jset_entry.  start and _data
 * both refer to the data that follows the fixed part: start views it as
 * struct jset_entry, _data as raw __u64s.
 */
struct bch_sb_field_clean {
        struct bch_sb_field     field;

        __le32                  flags;
        __le16                  _read_clock; /* no longer used */
        /* NOTE(review): leading underscore suggests this is unused too - confirm */
        __le16                  _write_clock;
        __le64                  journal_seq;

        struct jset_entry       start[0];
        __u64                   _data[];
};
630
/*
 * Superblock field type "ext" (see BCH_SB_FIELDS()).
 *
 * NOTE(review): the members look like bitmaps (128 bits of recovery passes,
 * 512 bits of error codes, 64 bits of btree ids) - confirm against users.
 */
struct bch_sb_field_ext {
        struct bch_sb_field     field;
        __le64                  recovery_passes_required[2];
        __le64                  errors_silent[8];
        __le64                  btrees_lost_data;
};
637
638 /* Superblock: */
639
/*
 * New versioning scheme:
 * One common version number for all on disk data structures - superblock, btree
 * nodes, journal entries
 *
 * A version number carries the minor version in its low 10 bits and the major
 * version in the bits above.
 */
#define BCH_VERSION_MAJOR(_v)           ((__u16) ((_v) >> 10))
#define BCH_VERSION_MINOR(_v)           ((__u16) ((_v) & ((1U << 10) - 1)))
#define BCH_VERSION(_major, _minor)     (((_major) << 10) | (_minor))
648
/*
 * Table of metadata versions, as an x-macro.  Each entry is x(name, version):
 *
 * field 1:             version name
 * field 2:             BCH_VERSION(major, minor)
 * field 3:             recovery passes required on upgrade
 *
 * NOTE(review): the entries below only have two fields; the "field 3" line
 * appears to be stale - confirm where the upgrade recovery passes now live.
 */
#define BCH_METADATA_VERSIONS()                                         \
        x(bkey_renumber,                BCH_VERSION(0, 10))             \
        x(inode_btree_change,           BCH_VERSION(0, 11))             \
        x(snapshot,                     BCH_VERSION(0, 12))             \
        x(inode_backpointers,           BCH_VERSION(0, 13))             \
        x(btree_ptr_sectors_written,    BCH_VERSION(0, 14))             \
        x(snapshot_2,                   BCH_VERSION(0, 15))             \
        x(reflink_p_fix,                BCH_VERSION(0, 16))             \
        x(subvol_dirent,                BCH_VERSION(0, 17))             \
        x(inode_v2,                     BCH_VERSION(0, 18))             \
        x(freespace,                    BCH_VERSION(0, 19))             \
        x(alloc_v4,                     BCH_VERSION(0, 20))             \
        x(new_data_types,               BCH_VERSION(0, 21))             \
        x(backpointers,                 BCH_VERSION(0, 22))             \
        x(inode_v3,                     BCH_VERSION(0, 23))             \
        x(unwritten_extents,            BCH_VERSION(0, 24))             \
        x(bucket_gens,                  BCH_VERSION(0, 25))             \
        x(lru_v2,                       BCH_VERSION(0, 26))             \
        x(fragmentation_lru,            BCH_VERSION(0, 27))             \
        x(no_bps_in_alloc_keys,         BCH_VERSION(0, 28))             \
        x(snapshot_trees,               BCH_VERSION(0, 29))             \
        x(major_minor,                  BCH_VERSION(1,  0))             \
        x(snapshot_skiplists,           BCH_VERSION(1,  1))             \
        x(deleted_inodes,               BCH_VERSION(1,  2))             \
        x(rebalance_work,               BCH_VERSION(1,  3))             \
        x(member_seq,                   BCH_VERSION(1,  4))             \
        x(subvolume_fs_parent,          BCH_VERSION(1,  5))             \
        x(btree_subvolume_children,     BCH_VERSION(1,  6))             \
        x(mi_btree_bitmap,              BCH_VERSION(1,  7))             \
        x(bucket_stripe_sectors,        BCH_VERSION(1,  8))             \
        x(disk_accounting_v2,           BCH_VERSION(1,  9))             \
        x(disk_accounting_v3,           BCH_VERSION(1, 10))             \
        x(disk_accounting_inum,         BCH_VERSION(1, 11))             \
        x(rebalance_work_acct_fix,      BCH_VERSION(1, 12))             \
        x(inode_has_child_snapshots,    BCH_VERSION(1, 13))             \
        x(backpointer_bucket_gen,       BCH_VERSION(1, 14))             \
        x(disk_accounting_big_endian,   BCH_VERSION(1, 15))             \
        x(reflink_p_may_update_opts,    BCH_VERSION(1, 16))             \
        x(inode_depth,                  BCH_VERSION(1, 17))             \
        x(persistent_inode_cursors,     BCH_VERSION(1, 18))             \
        x(autofix_errors,               BCH_VERSION(1, 19))             \
        x(directory_size,               BCH_VERSION(1, 20))             \
        x(cached_backpointers,          BCH_VERSION(1, 21))             \
        x(stripe_backpointers,          BCH_VERSION(1, 22))             \
        x(stripe_lru,                   BCH_VERSION(1, 23))             \
        x(casefolding,                  BCH_VERSION(1, 24))             \
        x(extent_flags,                 BCH_VERSION(1, 25))             \
        x(snapshot_deletion_v2,         BCH_VERSION(1, 26))             \
        x(fast_device_removal,          BCH_VERSION(1, 27))             \
        x(inode_has_case_insensitive,   BCH_VERSION(1, 28))
704
/*
 * One bcachefs_metadata_version_<name> enumerator per BCH_METADATA_VERSIONS()
 * entry, with the value given in the table.
 *
 * bcachefs_metadata_version_min is fixed at 9, one below the first table
 * entry.  bcachefs_metadata_version_max has no explicit value, so it is one
 * more than the last table entry.
 */
enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
#define x(t, n) bcachefs_metadata_version_##t = n,
        BCH_METADATA_VERSIONS()
#undef x
        bcachefs_metadata_version_max
};
712
/*
 * Set to the rebalance_work version.
 * NOTE(review): presumably filesystems older than this version must be
 * upgraded - confirm against users.
 */
static const __maybe_unused
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;

/* The newest version in BCH_METADATA_VERSIONS(): max is one past the last entry */
#define bcachefs_metadata_version_current       (bcachefs_metadata_version_max - 1)
717
/* NOTE(review): presumably the default sector of the first superblock - confirm */
#define BCH_SB_SECTOR                   8

/* Upper bound for sb_max_size_bits: 2^16 sectors of 512 bytes */
#define BCH_SB_LAYOUT_SIZE_BITS_MAX     16 /* 32 MB */

/*
 * struct bch_sb_layout - locations of the superblock and its backups.
 *
 * As described in the overview comment at the top of this file, a copy is
 * stored inside every superblock and also before the first superblock.
 */
struct bch_sb_layout {
        __uuid_t                magic;  /* bcachefs superblock UUID */
        __u8                    layout_type;
        __u8                    sb_max_size_bits; /* base 2 of 512 byte sectors */
        /* NOTE(review): presumably the number of valid sb_offset[] entries */
        __u8                    nr_superblocks;
        __u8                    pad[5];
        __le64                  sb_offset[61];
} __packed __aligned(8);

/* NOTE(review): presumably the sector holding the standalone layout copy - confirm */
#define BCH_SB_LAYOUT_SECTOR    7
732
733 /*
734  * @offset      - sector where this sb was written
735  * @version     - on disk format version
736  * @version_min - Oldest metadata version this filesystem contains; so we can
737  *                safely drop compatibility code and refuse to mount filesystems
738  *                we'd need it for
739  * @magic       - identifies as a bcachefs superblock (BCHFS_MAGIC)
741  * @uuid        - used for generating various magic numbers and identifying
742  *                member devices, never changes
743  * @user_uuid   - user visible UUID, may be changed
744  * @label       - filesystem label
745  * @seq         - identifies most recent superblock, incremented each time
746  *                superblock is written
     * @flags       - packed option/state bits, read and written through the
     *                BCH_SB_* bitmask accessors defined below
747  * @features    - enabled incompatible features
     * @layout      - embedded copy of struct bch_sb_layout
     * @start/_data - start of the variable length, type tagged fields
     *                (struct bch_sb_field)
748  */
749 struct bch_sb {
750         struct bch_csum         csum;
751         __le16                  version;
752         __le16                  version_min;
753         __le16                  pad[2];
754         __uuid_t                magic;
755         __uuid_t                uuid;
756         __uuid_t                user_uuid;
757         __u8                    label[BCH_SB_LABEL_SIZE];
758         __le64                  offset;
759         __le64                  seq;
760
761         __le16                  block_size;
762         __u8                    dev_idx;
763         __u8                    nr_devices;
764         __le32                  u64s;
765
766         __le64                  time_base_lo;
767         __le32                  time_base_hi;
768         __le32                  time_precision;
769
770         __le64                  flags[7];
771         __le64                  write_time;
772         __le64                  features[2];
773         __le64                  compat[2];
774
775         struct bch_sb_layout    layout;
776
777         struct bch_sb_field     start[0];
778         __le64                  _data[];
779 } __packed __aligned(8);
780
781 /*
782  * Flags:
783  * BCH_SB_INITIALIZED   - set on first mount
784  * BCH_SB_CLEAN         - did we shut down cleanly? Just a hint, doesn't affect
785  *                        behaviour of mount/recovery path:
786  * BCH_SB_INODE_32BIT   - limit inode numbers to 32 bits
787  * BCH_SB_128_BIT_MACS  - 128 bit macs instead of 80
788  * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
789  *                         DATA/META_CSUM_TYPE. Also indicates encryption
790  *                         algorithm in use, if/when we get more than one
791  */
792
793 LE16_BITMASK(BCH_SB_BLOCK_SIZE,         struct bch_sb, block_size, 0, 16);
794
    /*
     * Accessors for the packed bits in bch_sb.flags[]. The last two arguments
     * give the bit range within the named word; adjacent fields share a
     * boundary (0-1, 1-2, 2-8, ...), so the range reads as [start, end).
     *
     * flags[0]: all 64 bits are assigned.
     */
795 LE64_BITMASK(BCH_SB_INITIALIZED,        struct bch_sb, flags[0],  0,  1);
796 LE64_BITMASK(BCH_SB_CLEAN,              struct bch_sb, flags[0],  1,  2);
797 LE64_BITMASK(BCH_SB_CSUM_TYPE,          struct bch_sb, flags[0],  2,  8);
798 LE64_BITMASK(BCH_SB_ERROR_ACTION,       struct bch_sb, flags[0],  8, 12);
799
800 LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,    struct bch_sb, flags[0], 12, 28);
801
802 LE64_BITMASK(BCH_SB_GC_RESERVE,         struct bch_sb, flags[0], 28, 33);
803 LE64_BITMASK(BCH_SB_ROOT_RESERVE,       struct bch_sb, flags[0], 33, 40);
804
805 LE64_BITMASK(BCH_SB_META_CSUM_TYPE,     struct bch_sb, flags[0], 40, 44);
806 LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,     struct bch_sb, flags[0], 44, 48);
807
808 LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
809 LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
810
811 LE64_BITMASK(BCH_SB_POSIX_ACL,          struct bch_sb, flags[0], 56, 57);
812 LE64_BITMASK(BCH_SB_USRQUOTA,           struct bch_sb, flags[0], 57, 58);
813 LE64_BITMASK(BCH_SB_GRPQUOTA,           struct bch_sb, flags[0], 58, 59);
814 LE64_BITMASK(BCH_SB_PRJQUOTA,           struct bch_sb, flags[0], 59, 60);
815
816 LE64_BITMASK(BCH_SB_HAS_ERRORS,         struct bch_sb, flags[0], 60, 61);
817 LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
818
819 LE64_BITMASK(BCH_SB_BIG_ENDIAN,         struct bch_sb, flags[0], 62, 63);
820 LE64_BITMASK(BCH_SB_PROMOTE_WHOLE_EXTENTS,
821                                         struct bch_sb, flags[0], 63, 64);
822
    /* flags[1]: all 64 bits are assigned. */
823 LE64_BITMASK(BCH_SB_STR_HASH_TYPE,      struct bch_sb, flags[1],  0,  4);
824 LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
825 LE64_BITMASK(BCH_SB_INODE_32BIT,        struct bch_sb, flags[1],  8,  9);
826
827 LE64_BITMASK(BCH_SB_128_BIT_MACS,       struct bch_sb, flags[1],  9, 10);
828 LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,    struct bch_sb, flags[1], 10, 14);
829
830 /*
831  * Max size of an extent that may require bouncing to read or write
832  * (checksummed, compressed): 64k
833  */
834 LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
835                                         struct bch_sb, flags[1], 14, 20);
836
837 LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,  struct bch_sb, flags[1], 20, 24);
838 LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,  struct bch_sb, flags[1], 24, 28);
839
840 LE64_BITMASK(BCH_SB_PROMOTE_TARGET,     struct bch_sb, flags[1], 28, 40);
841 LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,  struct bch_sb, flags[1], 40, 52);
842 LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,  struct bch_sb, flags[1], 52, 64);
843
    /* flags[2]: all 64 bits are assigned. */
844 LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
845                                         struct bch_sb, flags[2],  0,  4);
846 LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,   struct bch_sb, flags[2],  4, 64);
847
    /* flags[3]: all 64 bits are assigned. */
848 LE64_BITMASK(BCH_SB_ERASURE_CODE,       struct bch_sb, flags[3],  0, 16);
849 LE64_BITMASK(BCH_SB_METADATA_TARGET,    struct bch_sb, flags[3], 16, 28);
850 LE64_BITMASK(BCH_SB_SHARD_INUMS,        struct bch_sb, flags[3], 28, 29);
851 LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
852 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
853 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
854 LE64_BITMASK(BCH_SB_MULTI_DEVICE,       struct bch_sb,  flags[3], 63, 64);
    /* flags[4]: all 64 bits are assigned. */
855 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
856 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
857 LE64_BITMASK(BCH_SB_NOCOW,              struct bch_sb, flags[4], 33, 34);
858 LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE,  struct bch_sb, flags[4], 34, 54);
859 LE64_BITMASK(BCH_SB_VERSION_UPGRADE,    struct bch_sb, flags[4], 54, 56);
860
    /* High halves of the two compression type options; the low halves live in flags[1] and flags[2]. */
861 LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
862 LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
863                                         struct bch_sb, flags[4], 60, 64);
864
    /* flags[5]: all 64 bits are assigned. */
865 LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
866                                         struct bch_sb, flags[5],  0, 16);
867 LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT,
868                                         struct bch_sb, flags[5], 16, 32);
869 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT,   struct bch_sb, flags[5], 32, 48);
870 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
871                                         struct bch_sb, flags[5], 48, 64);
    /* flags[6]: bits 24-63 are not assigned here. */
872 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,  struct bch_sb, flags[6],  0,  4);
873 LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);
874 LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR,  struct bch_sb, flags[6], 14, 20);
875 LE64_BITMASK(BCH_SB_DEGRADED_ACTION,    struct bch_sb, flags[6], 20, 22);
876 LE64_BITMASK(BCH_SB_CASEFOLD,           struct bch_sb, flags[6], 22, 23);
877 LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY,  struct bch_sb, flags[6], 23, 24);
878
879 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
880 {
            /* Stored as two 4 bit fields; the HI field supplies bits 4 and up. */
881         return (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4) | BCH_SB_COMPRESSION_TYPE_LO(sb);
882 }
883
884 static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
885 {
            /*
             * Bits 4 and up of v go to the HI field; LO is handed v unchanged.
             * NOTE(review): relies on the generated setter masking v to the
             * field width - confirm against LE64_BITMASK.
             */
886         SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
887         SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
888 }
889
890 static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
891 {
            /* Stored as two 4 bit fields; the HI field supplies bits 4 and up. */
892         return (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4) |
893                 BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb);
894 }
895
896 static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
897 {
            /*
             * Bits 4 and up of v go to the HI field; LO is handed v unchanged.
             * NOTE(review): relies on the generated setter masking v to the
             * field width - confirm against LE64_BITMASK.
             */
898         SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
899         SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
900 }
901
902 /*
903  * Features:
904  *
     * Each x(name, nr) entry below names a feature and its number; the numbers
     * are used as bit positions (see the BIT_ULL() masks that follow the table).
     *
905  * journal_seq_blacklist_v3:    gates BCH_SB_FIELD_journal_seq_blacklist
906  * reflink:                     gates KEY_TYPE_reflink
907  * inline_data:                 gates KEY_TYPE_inline_data
908  * new_siphash:                 gates BCH_STR_HASH_siphash
909  * new_extent_overwrite:        gates BTREE_NODE_NEW_EXTENT_OVERWRITE
910  */
911 #define BCH_SB_FEATURES()                       \
912         x(lz4,                          0)      \
913         x(gzip,                         1)      \
914         x(zstd,                         2)      \
915         x(atomic_nlink,                 3)      \
916         x(ec,                           4)      \
917         x(journal_seq_blacklist_v3,     5)      \
918         x(reflink,                      6)      \
919         x(new_siphash,                  7)      \
920         x(inline_data,                  8)      \
921         x(new_extent_overwrite,         9)      \
922         x(incompressible,               10)     \
923         x(btree_ptr_v2,                 11)     \
924         x(extents_above_btree_updates,  12)     \
925         x(btree_updates_journalled,     13)     \
926         x(reflink_inline_data,          14)     \
927         x(new_varint,                   15)     \
928         x(journal_no_flush,             16)     \
929         x(alloc_v2,                     17)     \
930         x(extents_across_btree_nodes,   18)     \
931         x(incompat_version_field,       19)     \
932         x(casefolding,                  20)     \
933         x(no_alloc_info,                21)     \
934         x(small_image,                  22)
935
936 #define BCH_SB_FEATURES_ALWAYS                          \
937         (BIT_ULL(BCH_FEATURE_new_extent_overwrite)|     \
938          BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\
939          BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\
940          BIT_ULL(BCH_FEATURE_alloc_v2)|\
941          BIT_ULL(BCH_FEATURE_extents_across_btree_nodes))
942
    /*
     * Both macros are 64 bit masks built from feature bit numbers.
     * BCH_SB_FEATURES_ALL is BCH_SB_FEATURES_ALWAYS plus five more bits; despite
     * the name it does not contain every entry of BCH_SB_FEATURES() (lz4, gzip
     * and zstd, for example, are absent).
     * NOTE(review): presumably ALWAYS is set unconditionally and ALL is what a
     * new filesystem gets by default - confirm against the users.
     */
943 #define BCH_SB_FEATURES_ALL                             \
944         (BCH_SB_FEATURES_ALWAYS|                        \
945          BIT_ULL(BCH_FEATURE_new_siphash)|              \
946          BIT_ULL(BCH_FEATURE_btree_ptr_v2)|             \
947          BIT_ULL(BCH_FEATURE_new_varint)|               \
948          BIT_ULL(BCH_FEATURE_journal_no_flush)|         \
949          BIT_ULL(BCH_FEATURE_incompat_version_field))
950
951 enum bch_sb_feature {
            /*
             * Pin every enumerator to the bit number declared in
             * BCH_SB_FEATURES() rather than to its position in the table, so a
             * reordered or sparse table cannot silently change on-disk bits.
             */
952 #define x(f, n) BCH_FEATURE_##f = n,
953         BCH_SB_FEATURES()
954 #undef x
            /* One greater than the number of the last table entry. */
955         BCH_FEATURE_NR,
956 };
957
958 #define BCH_SB_COMPAT()                                 \
959         x(alloc_info,                           0)      \
960         x(alloc_metadata,                       1)      \
961         x(extents_above_btree_updates_done,     2)      \
962         x(bformat_overflow_done,                3)
    /* x(name, nr) table expanded by enum bch_sb_compat into BCH_COMPAT_<name>. */
963
964 enum bch_sb_compat {
            /*
             * Pin every enumerator to the number declared in BCH_SB_COMPAT()
             * rather than to its position in the table.
             */
965 #define x(f, n) BCH_COMPAT_##f = n,
966         BCH_SB_COMPAT()
967 #undef x
            /* One greater than the number of the last table entry. */
968         BCH_COMPAT_NR,
969 };
970
971 /* options: */
972
973 #define BCH_VERSION_UPGRADE_OPTS()      \
974         x(compatible,           0)      \
975         x(incompatible,         1)      \
976         x(none,                 2)
977
    /* BCH_VERSION_UPGRADE_<name>, with the explicit values from the table above; no _NR member. */
978 enum bch_version_upgrade_opts {
979 #define x(t, n) BCH_VERSION_UPGRADE_##t = n,
980         BCH_VERSION_UPGRADE_OPTS()
981 #undef x
982 };
983
    /* NOTE(review): presumably the upper limit on the replicas options - confirm */
984 #define BCH_REPLICAS_MAX                4U
985
    /* NOTE(review): presumably the most pointers a single key may carry - confirm */
986 #define BCH_BKEY_PTRS_MAX               16U
987
988 #define BCH_ERROR_ACTIONS()             \
989         x(continue,             0)      \
990         x(fix_safe,             1)      \
991         x(panic,                2)      \
992         x(ro,                   3)
993
    /* BCH_ON_ERROR_<name>, with the explicit values from the table above. */
994 enum bch_error_actions {
995 #define x(t, n) BCH_ON_ERROR_##t = n,
996         BCH_ERROR_ACTIONS()
997 #undef x
            /* Follows the last entry, so it is one greater than its value. */
998         BCH_ON_ERROR_NR
999 };
1000
1001 #define BCH_DEGRADED_ACTIONS()          \
1002         x(ask,                  0)      \
1003         x(yes,                  1)      \
1004         x(very,                 2)      \
1005         x(no,                   3)
1006
     /* BCH_DEGRADED_<name>, with the explicit values from the table above. */
1007 enum bch_degraded_actions {
1008 #define x(t, n) BCH_DEGRADED_##t = n,
1009         BCH_DEGRADED_ACTIONS()
1010 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1011         BCH_DEGRADED_ACTIONS_NR
1012 };
1013
1014 #define BCH_STR_HASH_TYPES()            \
1015         x(crc32c,               0)      \
1016         x(crc64,                1)      \
1017         x(siphash_old,          2)      \
1018         x(siphash,              3)
1019
     /*
      * BCH_STR_HASH_<name>, with the explicit values from the table above.
      * These are not the same numbers as the BCH_STR_HASH_OPT_* values:
      * siphash is 3 here but 2 in the option table.
      */
1020 enum bch_str_hash_type {
1021 #define x(t, n) BCH_STR_HASH_##t = n,
1022         BCH_STR_HASH_TYPES()
1023 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1024         BCH_STR_HASH_NR
1025 };
1026
1027 #define BCH_STR_HASH_OPTS()             \
1028         x(crc32c,               0)      \
1029         x(crc64,                1)      \
1030         x(siphash,              2)
1031
     /*
      * BCH_STR_HASH_OPT_<name>, with the explicit values from the table above.
      * There is no siphash_old option, so the numbering differs from
      * enum bch_str_hash_type.
      */
1032 enum bch_str_hash_opts {
1033 #define x(t, n) BCH_STR_HASH_OPT_##t = n,
1034         BCH_STR_HASH_OPTS()
1035 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1036         BCH_STR_HASH_OPT_NR
1037 };
1038
1039 #define BCH_CSUM_TYPES()                        \
1040         x(none,                         0)      \
1041         x(crc32c_nonzero,               1)      \
1042         x(crc64_nonzero,                2)      \
1043         x(chacha20_poly1305_80,         3)      \
1044         x(chacha20_poly1305_128,        4)      \
1045         x(crc32c,                       5)      \
1046         x(crc64,                        6)      \
1047         x(xxhash,                       7)
1048
     /*
      * BCH_CSUM_<name>, with the explicit values from the table above.
      * The numbering differs from the BCH_CSUM_OPT_* option values.
      */
1049 enum bch_csum_type {
1050 #define x(t, n) BCH_CSUM_##t = n,
1051         BCH_CSUM_TYPES()
1052 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1053         BCH_CSUM_NR
1054 };
1055
1056 static const __maybe_unused unsigned bch_crc_bytes[] = {
             /*
              * Indexed by enum bch_csum_type; going by the name, each value is
              * the checksum size in bytes (e.g. 10 bytes = 80 bits and
              * 16 bytes = 128 bits for the two chacha20_poly1305 variants).
              */
1057         [BCH_CSUM_none]                         = 0,
1058         [BCH_CSUM_crc32c_nonzero]               = 4,
1059         [BCH_CSUM_crc32c]                       = 4,
1060         [BCH_CSUM_crc64_nonzero]                = 8,
1061         [BCH_CSUM_crc64]                        = 8,
1062         [BCH_CSUM_xxhash]                       = 8,
1063         [BCH_CSUM_chacha20_poly1305_80]         = 10,
1064         [BCH_CSUM_chacha20_poly1305_128]        = 16,
1065 };
1066
1067 static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
1068 {
             /* Only the two chacha20_poly1305 variants count as encryption types. */
1069         return type == BCH_CSUM_chacha20_poly1305_80 ||
1070                type == BCH_CSUM_chacha20_poly1305_128;
1071 }
1077
1078 #define BCH_CSUM_OPTS()                 \
1079         x(none,                 0)      \
1080         x(crc32c,               1)      \
1081         x(crc64,                2)      \
1082         x(xxhash,               3)
1083
     /*
      * BCH_CSUM_OPT_<name>, with the explicit values from the table above.
      * These are option values, numbered differently from enum bch_csum_type.
      */
1084 enum bch_csum_opt {
1085 #define x(t, n) BCH_CSUM_OPT_##t = n,
1086         BCH_CSUM_OPTS()
1087 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1088         BCH_CSUM_OPT_NR
1089 };
1090
1091 #define BCH_COMPRESSION_TYPES()         \
1092         x(none,                 0)      \
1093         x(lz4_old,              1)      \
1094         x(gzip,                 2)      \
1095         x(lz4,                  3)      \
1096         x(zstd,                 4)      \
1097         x(incompressible,       5)
1098
     /*
      * BCH_COMPRESSION_TYPE_<name>, with the explicit values from the table
      * above. The numbering differs from the BCH_COMPRESSION_OPT_* values
      * (lz4 is 3 here, 1 in the option table).
      */
1099 enum bch_compression_type {
1100 #define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
1101         BCH_COMPRESSION_TYPES()
1102 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1103         BCH_COMPRESSION_TYPE_NR
1104 };
1105
1106 #define BCH_COMPRESSION_OPTS()          \
1107         x(none,         0)              \
1108         x(lz4,          1)              \
1109         x(gzip,         2)              \
1110         x(zstd,         3)
1111
     /*
      * BCH_COMPRESSION_OPT_<name>, with the explicit values from the table
      * above; numbered independently of enum bch_compression_type.
      */
1112 enum bch_compression_opts {
1113 #define x(t, n) BCH_COMPRESSION_OPT_##t = n,
1114         BCH_COMPRESSION_OPTS()
1115 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1116         BCH_COMPRESSION_OPT_NR
1117 };
1118
1119 /*
1120  * Magic numbers
1121  *
1122  * The various other data structures have their own magic numbers, which are
1123  * xored with the first part of the cache set's UUID
1124  */
1125
1126 #define BCACHE_MAGIC                                                    \
1127         UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,                           \
1128                   0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
1129 #define BCHFS_MAGIC                                                     \
1130         UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,                           \
1131                   0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
1132
1133 #define BCACHEFS_STATFS_MAGIC           BCACHEFS_SUPER_MAGIC
1134
     /*
      * Base constants, stored little endian; __jset_magic() and __bset_magic()
      * below xor them with the first 8 bytes of the superblock UUID.
      */
1135 #define JSET_MAGIC              __cpu_to_le64(0x245235c1a3625032ULL)
1136 #define BSET_MAGIC              __cpu_to_le64(0x90135c78b99e07f5ULL)
1137
1138 static inline __le64 __bch2_sb_magic(const struct bch_sb *sb)
1139 {
1140         __le64 ret;
1141
             /* Copy out the first sizeof(ret) bytes of the UUID; memcpy avoids a type-punning cast. */
1142         memcpy(&ret, &sb->uuid, sizeof(ret));
1143         return ret;
1144 }
1145
     /* Per-filesystem journal magic: UUID prefix xor JSET_MAGIC, returned in CPU byte order. */
1146 static inline __u64 __jset_magic(const struct bch_sb *sb)
1147 {
1148         return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
1149 }
1150
     /* Per-filesystem bset magic: UUID prefix xor BSET_MAGIC, returned in CPU byte order. */
1151 static inline __u64 __bset_magic(const struct bch_sb *sb)
1152 {
1153         return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
1154 }
1155
1156 /* Journal */
1157
1158 #define JSET_KEYS_U64s  (sizeof(struct jset_entry) / sizeof(__u64))
     /* i.e. the size of struct jset_entry expressed in 64 bit words. */
1159
1160 #define BCH_JSET_ENTRY_TYPES()                  \
1161         x(btree_keys,           0)              \
1162         x(btree_root,           1)              \
1163         x(prio_ptrs,            2)              \
1164         x(blacklist,            3)              \
1165         x(blacklist_v2,         4)              \
1166         x(usage,                5)              \
1167         x(data_usage,           6)              \
1168         x(clock,                7)              \
1169         x(dev_usage,            8)              \
1170         x(log,                  9)              \
1171         x(overwrite,            10)             \
1172         x(write_buffer_keys,    11)             \
1173         x(datetime,             12)             \
1174         x(log_bkey,             13)
1175
     /* BCH_JSET_ENTRY_<name>, with the explicit values from the table above. */
1176 enum bch_jset_entry_type {
1177 #define x(f, nr)        BCH_JSET_ENTRY_##f      = nr,
1178         BCH_JSET_ENTRY_TYPES()
1179 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1180         BCH_JSET_ENTRY_NR
1181 };
1182
1183 static inline bool jset_entry_is_key(struct jset_entry *e)
1184 {
             /* True only for the btree_keys, btree_root and write_buffer_keys entry types. */
1185         return e->type == BCH_JSET_ENTRY_btree_keys ||
1186                e->type == BCH_JSET_ENTRY_btree_root ||
1187                e->type == BCH_JSET_ENTRY_write_buffer_keys;
1188 }
1194
1195 /*
1196  * Journal sequence numbers can be blacklisted: bsets record the max sequence
1197  * number of all the journal entries they contain updates for, so that on
1198  * recovery we can ignore those bsets that contain index updates newer than what
1199  * made it into the journal.
1200  *
1201  * This means that we can't reuse that journal_seq - we have to skip it, and
1202  * then record that we skipped it so that the next time we crash and recover we
1203  * don't think there was a missing journal entry.
1204  */
1205 struct jset_entry_blacklist {
1206         struct jset_entry       entry;
             /* A single blacklisted journal sequence number. */
1207         __le64                  seq;
1208 };
1209
1210 struct jset_entry_blacklist_v2 {
1211         struct jset_entry       entry;
             /*
              * A range of blacklisted sequence numbers.
              * NOTE(review): whether @end is inclusive is not visible here - confirm.
              */
1212         __le64                  start;
1213         __le64                  end;
1214 };
1215
1216 #define BCH_FS_USAGE_TYPES()                    \
1217         x(reserved,             0)              \
1218         x(inodes,               1)              \
1219         x(key_version,          2)
1220
     /* BCH_FS_USAGE_<name>, with the explicit values from the table above. */
1221 enum bch_fs_usage_type {
1222 #define x(f, nr)        BCH_FS_USAGE_##f        = nr,
1223         BCH_FS_USAGE_TYPES()
1224 #undef x
             /* Follows the last entry, so it is one greater than its value. */
1225         BCH_FS_USAGE_NR
1226 };
1227
1228 struct jset_entry_usage {
1229         struct jset_entry       entry;
             /* NOTE(review): presumably the counter value for one bch_fs_usage_type - confirm */
1230         __le64                  v;
1231 } __packed;
1232
1233 struct jset_entry_data_usage {
1234         struct jset_entry       entry;
1235         __le64                  v;
             /* Replicas entry the value @v belongs to; last member of the struct. */
1236         struct bch_replicas_entry_v1 r;
1237 } __packed;
1238
1239 struct jset_entry_clock {
1240         struct jset_entry       entry;
             /* NOTE(review): presumably selects the read or the write clock - confirm */
1241         __u8                    rw;
             /* Pads @rw out to 8 bytes so @time follows on a 64 bit boundary within the payload. */
1242         __u8                    pad[7];
1243         __le64                  time;
1244 } __packed;
1245
1246 struct jset_entry_dev_usage_type {
             /* One element of jset_entry_dev_usage.d[]: three 64 bit counters. */
1247         __le64                  buckets;
1248         __le64                  sectors;
1249         __le64                  fragmented;
1250 } __packed;
1251
1252 struct jset_entry_dev_usage {
1253         struct jset_entry       entry;
1254         __le32                  dev;
1255         __u32                   pad;
1256
1257         __le64                  _buckets_ec;            /* No longer used */
1258         __le64                  _buckets_unavailable;   /* No longer used */
1259
             /* Variable length; element count comes from jset_entry_dev_usage_nr_types(). */
1260         struct jset_entry_dev_usage_type d[];
1261 };
1262
1263 static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
1264 {
             /* Bytes past the fixed part of the entry, divided by the size of one d[] element. */
1265         return (vstruct_bytes(&u->entry) - sizeof(*u)) /
1266                 sizeof(u->d[0]);
1267 }
1268
1269 struct jset_entry_log {
1270         struct jset_entry       entry;
             /*
              * Message bytes. __u8 matches the fixed-width types used by every
              * other structure in this header. Trailing zero bytes are not part
              * of the message (see jset_entry_log_msg_bytes()).
              */
1271         __u8                    d[];
1272 } __packed __aligned(8);
1273
1274 static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l)
1275 {
1276         unsigned len = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d);
1277
             /* Walk back over trailing zero bytes; what is left is the message length. */
1278         for (; len && l->d[len - 1] == 0; len--)
1279                 ;
1280         return len;
1281 }
1282
1283 struct jset_entry_datetime {
1284         struct jset_entry       entry;
             /* NOTE(review): presumably wall clock time in seconds; the epoch is not visible here - confirm */
1285         __le64                  seconds;
1286 } __packed __aligned(8);
1287
1288 /*
1289  * On disk format for a journal entry:
1290  * seq is monotonically increasing; every journal entry has its own unique
1291  * sequence number.
1292  *
1293  * last_seq is the oldest journal entry that still has keys the btree hasn't
1294  * flushed to disk yet.
1295  *
1296  * version is for on disk format changes.
      *
      * flags is accessed through the JSET_* bitmask accessors defined after the
      * struct; start/_data mark the beginning of the variable length list of
      * struct jset_entry.
1297  */
1298 struct jset {
1299         struct bch_csum         csum;
1300
1301         __le64                  magic;
1302         __le64                  seq;
1303         __le32                  version;
1304         __le32                  flags;
1305
1306         __le32                  u64s; /* size of d[] in u64s */
1307
             /*
              * Zero length marker.
              * NOTE(review): presumably everything from here on is covered by
              * encryption when it is enabled - confirm.
              */
1308         __u8                    encrypted_start[0];
1309
1310         __le16                  _read_clock; /* no longer used */
1311         __le16                  _write_clock;
1312
1313         /* Sequence number of oldest dirty journal entry */
1314         __le64                  last_seq;
1315
1316
1317         struct jset_entry       start[0];
1318         __u64                   _data[];
1319 } __packed __aligned(8);
1320
1321 LE32_BITMASK(JSET_CSUM_TYPE,    struct jset, flags, 0, 4);
1322 LE32_BITMASK(JSET_BIG_ENDIAN,   struct jset, flags, 4, 5);
1323 LE32_BITMASK(JSET_NO_FLUSH,     struct jset, flags, 5, 6);
     /* Bits 6-31 of jset.flags are not assigned here. */
1324
     /* NOTE(review): presumably the minimum number of journal buckets per device - confirm */
1325 #define BCH_JOURNAL_BUCKETS_MIN         8
1326
1327 /* Btree: */
1328
1329 enum btree_id_flags {
             /*
              * Per-btree property bits; OR'd together to form the third
              * argument of each BCH_BTREE_IDS() entry (0 when none apply).
              */
1330         BTREE_IS_extents        = BIT(0),
1331         BTREE_IS_snapshots      = BIT(1),
1332         BTREE_IS_snapshot_field = BIT(2),
1333         BTREE_IS_data           = BIT(3),
1334         BTREE_IS_write_buffer   = BIT(4),
1335 };
1336
1337 #define BCH_BTREE_IDS()                                                         \
1338         x(extents,              0,                                              \
1339           BTREE_IS_extents|                                                     \
1340           BTREE_IS_snapshots|                                                   \
1341           BTREE_IS_data,                                                        \
1342           BIT_ULL(KEY_TYPE_whiteout)|                                           \
1343           BIT_ULL(KEY_TYPE_error)|                                              \
1344           BIT_ULL(KEY_TYPE_cookie)|                                             \
1345           BIT_ULL(KEY_TYPE_extent)|                                             \
1346           BIT_ULL(KEY_TYPE_reservation)|                                        \
1347           BIT_ULL(KEY_TYPE_reflink_p)|                                          \
1348           BIT_ULL(KEY_TYPE_inline_data))                                        \
1349         x(inodes,               1,                                              \
1350           BTREE_IS_snapshots,                                                   \
1351           BIT_ULL(KEY_TYPE_whiteout)|                                           \
1352           BIT_ULL(KEY_TYPE_inode)|                                              \
1353           BIT_ULL(KEY_TYPE_inode_v2)|                                           \
1354           BIT_ULL(KEY_TYPE_inode_v3)|                                           \
1355           BIT_ULL(KEY_TYPE_inode_generation))                                   \
1356         x(dirents,              2,                                              \
1357           BTREE_IS_snapshots,                                                   \
1358           BIT_ULL(KEY_TYPE_whiteout)|                                           \
1359           BIT_ULL(KEY_TYPE_hash_whiteout)|                                      \
1360           BIT_ULL(KEY_TYPE_dirent))                                             \
1361         x(xattrs,               3,                                              \
1362           BTREE_IS_snapshots,                                                   \
1363           BIT_ULL(KEY_TYPE_whiteout)|                                           \
1364           BIT_ULL(KEY_TYPE_cookie)|                                             \
1365           BIT_ULL(KEY_TYPE_hash_whiteout)|                                      \
1366           BIT_ULL(KEY_TYPE_xattr))                                              \
1367         x(alloc,                4,      0,                                      \
1368           BIT_ULL(KEY_TYPE_alloc)|                                              \
1369           BIT_ULL(KEY_TYPE_alloc_v2)|                                           \
1370           BIT_ULL(KEY_TYPE_alloc_v3)|                                           \
1371           BIT_ULL(KEY_TYPE_alloc_v4))                                           \
1372         x(quotas,               5,      0,                                      \
1373           BIT_ULL(KEY_TYPE_quota))                                              \
1374         x(stripes,              6,      0,                                      \
1375           BIT_ULL(KEY_TYPE_stripe))                                             \
1376         x(reflink,              7,                                              \
1377           BTREE_IS_extents|                                                     \
1378           BTREE_IS_data,                                                        \
1379           BIT_ULL(KEY_TYPE_reflink_v)|                                          \
1380           BIT_ULL(KEY_TYPE_indirect_inline_data)|                               \
1381           BIT_ULL(KEY_TYPE_error))                                              \
1382         x(subvolumes,           8,      0,                                      \
1383           BIT_ULL(KEY_TYPE_subvolume))                                          \
1384         x(snapshots,            9,      0,                                      \
1385           BIT_ULL(KEY_TYPE_snapshot))                                           \
1386         x(lru,                  10,                                             \
1387           BTREE_IS_write_buffer,                                                \
1388           BIT_ULL(KEY_TYPE_set))                                                \
1389         x(freespace,            11,                                             \
1390           BTREE_IS_extents,                                                     \
1391           BIT_ULL(KEY_TYPE_set))                                                \
1392         x(need_discard,         12,     0,                                      \
1393           BIT_ULL(KEY_TYPE_set))                                                \
1394         x(backpointers,         13,                                             \
1395           BTREE_IS_write_buffer,                                                \
1396           BIT_ULL(KEY_TYPE_backpointer))                                        \
1397         x(bucket_gens,          14,     0,                                      \
1398           BIT_ULL(KEY_TYPE_bucket_gens))                                        \
1399         x(snapshot_trees,       15,     0,                                      \
1400           BIT_ULL(KEY_TYPE_snapshot_tree))                                      \
1401         x(deleted_inodes,       16,                                             \
1402           BTREE_IS_snapshot_field|                                              \
1403           BTREE_IS_write_buffer,                                                \
1404           BIT_ULL(KEY_TYPE_set))                                                \
1405         x(logged_ops,           17,     0,                                      \
1406           BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
1407           BIT_ULL(KEY_TYPE_logged_op_finsert)|                                  \
1408           BIT_ULL(KEY_TYPE_inode_alloc_cursor))                                 \
1409         x(rebalance_work,       18,                                             \
1410           BTREE_IS_snapshot_field|                                              \
1411           BTREE_IS_write_buffer,                                                \
1412           BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))                       \
1413         x(subvolume_children,   19,     0,                                      \
1414           BIT_ULL(KEY_TYPE_set))                                                \
1415         x(accounting,           20,                                             \
1416           BTREE_IS_snapshot_field|                                              \
1417           BTREE_IS_write_buffer,                                                \
1418           BIT_ULL(KEY_TYPE_accounting))                                         \
1419
     /*
      * Table entries are x(name, nr, flags, key type mask): flags is an OR of
      * BTREE_IS_* bits (or 0) and the mask is an OR of BIT_ULL(KEY_TYPE_*).
      * NOTE(review): the mask presumably lists the key types permitted in that
      * btree - confirm.
      *
      * The enum below only consumes the name and number; the remaining
      * arguments are swallowed by the variadic x().
      */
1420 enum btree_id {
1421 #define x(name, nr, ...) BTREE_ID_##name = nr,
1422         BCH_BTREE_IDS()
1423 #undef x
             /* Follows the last entry, so it is one greater than its number. */
1424         BTREE_ID_NR
1425 };
1426
1427 /*
1428  * Maximum number of btrees that we will _ever_ have under the current scheme,
1429  * where we refer to them with 64 bit bitfields - and we also need a bit for
1430  * the interior btree node type:
      *
      * (64 bits, minus the one reserved bit, leaves 63.)
1431  */
1432 #define BTREE_ID_NR_MAX         63
1433
1434 static inline bool btree_id_is_alloc(enum btree_id id)
1435 {
             /* True for exactly the seven btrees named here; everything else is false. */
1436         return id == BTREE_ID_alloc ||
1437                id == BTREE_ID_backpointers ||
1438                id == BTREE_ID_need_discard ||
1439                id == BTREE_ID_freespace ||
1440                id == BTREE_ID_bucket_gens ||
1441                id == BTREE_ID_lru ||
1442                id == BTREE_ID_accounting;
1443 }
1449
/*
 * NOTE(review): the name suggests an upper bound on the number of btree
 * levels; the U suffix makes the constant unsigned.  Confirm the exact
 * meaning at its users.
 */
1450 #define BTREE_MAX_DEPTH         4U
1451
1452 /* Btree nodes */
1453
1454 /*
1455  * Btree nodes
1456  *
1457  * On disk a btree node is a list/log of these; within each set the keys are
1458  * sorted
1459  */
1460 struct bset {
1461         __le64                  seq;
1462
1463         /*
1464          * Highest journal entry this bset contains keys for.
1465          * If on recovery we don't see that journal entry, this bset is ignored:
1466          * this allows us to preserve the order of all index updates after a
1467          * crash, since the journal records a total order of all index updates
1468          * and anything that didn't make it to the journal doesn't get used.
1469          */
1470         __le64                  journal_seq;
1471
        /* Bits of flags are read and written through the BSET_* bitmask helpers */
1472         __le32                  flags;
1473         __le16                  version;
1474         __le16                  u64s; /* count of _data[] in u64s */
1475
        /*
         * Key data.  start[] is zero length, so start and _data both name
         * the bytes that immediately follow u64s: the same storage viewed
         * as packed keys or as raw 64 bit words.
         */
1476         struct bkey_packed      start[0];
1477         __u64                   _data[];
1478 } __packed __aligned(8);
1479
/*
 * Bit fields of bset.flags.
 *
 * NOTE(review): LE32_BITMASK() is defined outside this excerpt; its last two
 * arguments look like a half-open bit range [first, end) - confirm against
 * the macro.  Read that way, bits 6-15 are not assigned by any mask here.
 */
1480 LE32_BITMASK(BSET_CSUM_TYPE,    struct bset, flags, 0, 4);
1481
1482 LE32_BITMASK(BSET_BIG_ENDIAN,   struct bset, flags, 4, 5);
1483 LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
1484                                 struct bset, flags, 5, 6);
1485
1486 /* Sector offset within the btree node: */
1487 LE32_BITMASK(BSET_OFFSET,       struct bset, flags, 16, 32);
1488
/*
 * Header written for the first entry in a btree node (offset 0), per the
 * overview comment at the top of this file.  The struct ends with a
 * struct bset, so the key data of that first bset follows the header
 * directly.
 *
 * Bits of flags are accessed through the BTREE_NODE_* bitmask helpers.
 */
1489 struct btree_node {
1490         struct bch_csum         csum;
1491         __le64                  magic;
1492
1493         /* this flags field is encrypted, unlike bset->flags: */
1494         __le64                  flags;
1495
1496         /* Closed interval: */
1497         struct bpos             min_key;
1498         struct bpos             max_key;
1499         struct bch_extent_ptr   _ptr; /* not used anymore */
1500         struct bkey_format      format;
1501
        /*
         * keys overlaid with an anonymous struct: pad[22] spans bset's seq,
         * journal_seq, flags and version (8 + 8 + 4 + 2 bytes), so u64s and
         * _data here alias keys.u64s and keys._data.
         */
1502         union {
1503         struct bset             keys;
1504         struct {
1505                 __u8            pad[22];
1506                 __le16          u64s;
1507                 __u64           _data[0];
1508
1509         };
1510         };
1511 } __packed __aligned(8);
1512
/*
 * Bit fields of btree_node.flags.
 *
 * The btree id is stored in two pieces, BTREE_NODE_ID_LO and
 * BTREE_NODE_ID_HI; BTREE_NODE_ID() and SET_BTREE_NODE_ID() combine and
 * split them, treating ID_LO as the low 4 bits of the id.
 *
 * NOTE(review): LE64_BITMASK() is defined outside this excerpt; its last two
 * arguments look like a half-open bit range [first, end) - confirm against
 * the macro.
 */
1513 LE64_BITMASK(BTREE_NODE_ID_LO,  struct btree_node, flags,  0,  4);
1514 LE64_BITMASK(BTREE_NODE_LEVEL,  struct btree_node, flags,  4,  8);
1515 LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
1516                                 struct btree_node, flags,  8,  9);
1517 LE64_BITMASK(BTREE_NODE_ID_HI,  struct btree_node, flags,  9, 25);
1518 /* 25-32 unused */
1519 LE64_BITMASK(BTREE_NODE_SEQ,    struct btree_node, flags, 32, 64);
1520
1521 static inline __u64 BTREE_NODE_ID(struct btree_node *n)
1522 {
1523         return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
1524 }
1525
1526 static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
1527 {
1528         SET_BTREE_NODE_ID_LO(n, v);
1529         SET_BTREE_NODE_ID_HI(n, v >> 4);
1530 }
1531
/*
 * Checksum followed by a struct bset.
 *
 * NOTE(review): the overview comment at the top of this file says struct
 * btree_node is used for the first entry in a node; this struct is
 * presumably used for the entries after it - confirm against the writers.
 */
1532 struct btree_node_entry {
1533         struct bch_csum         csum;
1534
        /*
         * Same overlay as in struct btree_node: pad[22] spans bset's seq,
         * journal_seq, flags and version (8 + 8 + 4 + 2 bytes), so u64s and
         * _data here alias keys.u64s and keys._data.
         */
1535         union {
1536         struct bset             keys;
1537         struct {
1538                 __u8            pad[22];
1539                 __le16          u64s;
1540                 __u64           _data[0];
1541         };
1542         };
1543 } __packed __aligned(8);
1544
1545 #endif /* _BCACHEFS_FORMAT_H */