/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FORMAT_H
#define _BCACHEFS_FORMAT_H

/*
 * bcachefs on disk data structures
 *
 * OVERVIEW:
 *
 * There are three main types of on disk data structures in bcachefs (this is
 * reduced from 5 in bcache)
 *
 *  - superblock
 *  - journal
 *  - btree
 *
 * The btree is the primary structure; most metadata exists as keys in the
 * various btrees. There are only a small number of btrees, they're not
 * sharded - we have one btree for extents, another for inodes, et cetera.
 *
 * SUPERBLOCK:
 *
 * The superblock contains the location of the journal, the list of devices in
 * the filesystem, and in general any metadata we need in order to decide
 * whether we can start a filesystem or prior to reading the journal/btree
 * roots.
 *
 * The superblock is extensible, and most of the contents of the superblock are
 * in variable length, type tagged fields; see struct bch_sb_field.
 *
 * Backup superblocks do not reside in a fixed location; also, superblocks do
 * not have a fixed size. To locate backup superblocks we have struct
 * bch_sb_layout; we store a copy of this inside every superblock, and also
 * before the first superblock.
 *
 * JOURNAL:
 *
 * The journal primarily records btree updates in the order they occurred;
 * journal replay consists of just iterating over all the keys in the open
 * journal entries and re-inserting them into the btrees.
 *
 * The journal also contains entry types for the btree roots, and blacklisted
 * journal sequence numbers (see journal_seq_blacklist.c).
 *
 * BTREE:
 *
 * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
 * 128k-256k) and log structured. We use struct btree_node for writing the first
 * entry in a given node (offset 0), and struct btree_node_entry for all
 * subsequent writes.
 *
 * After the header, btree node entries contain a list of keys in sorted order.
 * Values are stored inline with the keys; since values are variable length (and
 * keys effectively are variable length too, due to packing) we can't do random
 * access without building up additional in memory tables in the btree node read
 * path.
 *
 * BTREE KEYS (struct bkey):
 *
 * The various btrees share a common format for the key - so as to avoid
 * switching in fastpath lookup/comparison code - but define their own
 * structures for the key values.
 *
 * The size of a key/value pair is stored as a u8 in units of u64s, so the max
 * size is just under 2k. The common part also contains a type tag for the
 * value, and a format field indicating whether the key is packed or not (and
 * also meant to allow adding new key fields in the future, if desired).
 *
 * bkeys, when stored within a btree node, may also be packed. In that case, the
 * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
 * be generous with field sizes in the common part of the key format (64 bit
 * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
 */

#include <asm/types.h>
#include <asm/byteorder.h>
#include <linux/kernel.h>
#include <linux/uuid.h>
#include "vstructs.h"

#ifdef __KERNEL__
typedef uuid_t __uuid_t;
#endif

#define BITMASK(name, type, field, offset, end)				\
static const __maybe_unused unsigned	name##_OFFSET = offset;		\
static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
									\
static inline __u64 name(const type *k)					\
{									\
	return (k->field >> offset) & ~(~0ULL << (end - offset));	\
}									\
									\
static inline void SET_##name(type *k, __u64 v)				\
{									\
	k->field &= ~(~(~0ULL << (end - offset)) << offset);		\
	k->field |= (v & ~(~0ULL << (end - offset))) << offset;		\
}

#define LE_BITMASK(_bits, name, type, field, offset, end)		\
static const __maybe_unused unsigned	name##_OFFSET = offset;		\
static const __maybe_unused unsigned	name##_BITS = (end - offset);	\
static const __maybe_unused __u##_bits	name##_MAX = (1ULL << (end - offset)) - 1;\
									\
static inline __u64 name(const type *k)					\
{									\
	return (__le##_bits##_to_cpu(k->field) >> offset) &		\
		~(~0ULL << (end - offset));				\
}									\
									\
static inline void SET_##name(type *k, __u64 v)				\
{									\
	__u##_bits new = __le##_bits##_to_cpu(k->field);		\
									\
	new &= ~(~(~0ULL << (end - offset)) << offset);			\
	new |= (v & ~(~0ULL << (end - offset))) << offset;		\
	k->field = __cpu_to_le##_bits(new);				\
}

#define LE16_BITMASK(n, t, f, o, e)	LE_BITMASK(16, n, t, f, o, e)
#define LE32_BITMASK(n, t, f, o, e)	LE_BITMASK(32, n, t, f, o, e)
#define LE64_BITMASK(n, t, f, o, e)	LE_BITMASK(64, n, t, f, o, e)
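
/*
 * Example (illustrative only): expanding
 *
 *	LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
 *
 * as is done later in this file generates BCH_SB_CLEAN()/SET_BCH_SB_CLEAN()
 * accessors that read and write bit 1 of the little-endian flags[0] word,
 * converting to and from CPU byte order - so on disk bitfields can be
 * accessed safely on either endianness.
 */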

struct bkey_format {
	__u8		key_u64s;
	__u8		nr_fields;
	/* One unused slot for now: */
	__u8		bits_per_field[6];
	__le64		field_offset[6];
};

/* Btree keys - all units are in sectors */

struct bpos {
	/*
	 * Word order matches machine byte order - btree code treats a bpos as a
	 * single large integer, for search/comparison purposes
	 *
	 * Note that wherever a bpos is embedded in another on disk data
	 * structure, it has to be byte swabbed when reading in metadata that
	 * wasn't written in native endian order:
	 */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u32		snapshot;
	__u64		offset;
	__u64		inode;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	__u64		inode;
	__u64		offset;		/* Points to end of extent - sectors */
	__u32		snapshot;
#else
#error edit for your odd byteorder.
#endif
} __packed
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
__aligned(4)
#endif
;

#define KEY_INODE_MAX			((__u64)~0ULL)
#define KEY_OFFSET_MAX			((__u64)~0ULL)
#define KEY_SNAPSHOT_MAX		((__u32)~0U)
#define KEY_SIZE_MAX			((__u32)~0U)

static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
{
	return (struct bpos) {
		.inode		= inode,
		.offset		= offset,
		.snapshot	= snapshot,
	};
}

#define POS_MIN				SPOS(0, 0, 0)
#define POS_MAX				SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0)
#define SPOS_MAX			SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
#define POS(_inode, _offset)		SPOS(_inode, _offset, 0)
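
/*
 * Illustrative sketch (not part of the on disk format): because struct bpos is
 * laid out in machine word order, comparing two positions is equivalent to
 * comparing (inode, offset, snapshot) lexicographically. The hypothetical
 * helper below exists only to spell that out; the real comparison helpers live
 * in bkey.h.
 */
static inline int bpos_cmp_sketch(struct bpos l, struct bpos r)
{
	if (l.inode != r.inode)
		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	if (l.snapshot != r.snapshot)
		return l.snapshot < r.snapshot ? -1 : 1;
	return 0;
}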

/* Empty placeholder struct, for container_of() */
struct bch_val {
	__u64		__nothing[0];
};

struct bversion {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u64		lo;
	__u32		hi;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	__u32		hi;
	__u64		lo;
#endif
} __packed __aligned(4);

struct bkey {
	/* Size of combined key and value, in u64s */
	__u8		u64s;

	/* Format of key (0 for format local to btree node) */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8		format:7,
			needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8		needs_whiteout:1,
			format:7;
#else
#error edit for your odd byteorder.
#endif

	/* Type of the value */
	__u8		type;

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	__u8		pad[1];

	struct bversion	version;
	__u32		size;		/* extent size, in sectors */
	struct bpos	p;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	struct bpos	p;
	__u32		size;		/* extent size, in sectors */
	struct bversion	version;

	__u8		pad[1];
#endif
} __packed __aligned(8);

struct bkey_packed {
	__u64		_data[0];

	/* Size of combined key and value, in u64s */
	__u8		u64s;

	/* Format of key (0 for format local to btree node) */

	/*
	 * XXX: next incompat on disk format change, switch format and
	 * needs_whiteout - bkey_packed() will be cheaper if format is the high
	 * bits of the bitfield
	 */
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8		format:7,
			needs_whiteout:1;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u8		needs_whiteout:1,
			format:7;
#endif

	/* Type of the value */
	__u8		type;
	__u8		key_start[0];

	/*
	 * We copy bkeys with struct assignment in various places, and while
	 * that shouldn't be done with packed bkeys we can't disallow it in C,
	 * and it's legal to cast a bkey to a bkey_packed - so padding it out
	 * to the same size as struct bkey should hopefully be safest.
	 */
	__u8		pad[sizeof(struct bkey) - 3];
} __packed __aligned(8);

typedef struct {
	__le64		lo;
	__le64		hi;
} bch_le128;

#define BKEY_U64s			(sizeof(struct bkey) / sizeof(__u64))
#define BKEY_U64s_MAX			U8_MAX
#define BKEY_VAL_U64s_MAX		(BKEY_U64s_MAX - BKEY_U64s)

#define KEY_PACKED_BITS_START		24

#define KEY_FORMAT_LOCAL_BTREE		0
#define KEY_FORMAT_CURRENT		1

enum bch_bkey_fields {
	BKEY_FIELD_INODE,
	BKEY_FIELD_OFFSET,
	BKEY_FIELD_SNAPSHOT,
	BKEY_FIELD_SIZE,
	BKEY_FIELD_VERSION_HI,
	BKEY_FIELD_VERSION_LO,
	BKEY_NR_FIELDS,
};

#define bkey_format_field(name, field)					\
	[BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)

#define BKEY_FORMAT_CURRENT						\
((struct bkey_format) {							\
	.key_u64s	= BKEY_U64s,					\
	.nr_fields	= BKEY_NR_FIELDS,				\
	.bits_per_field	= {						\
		bkey_format_field(INODE,	p.inode),		\
		bkey_format_field(OFFSET,	p.offset),		\
		bkey_format_field(SNAPSHOT,	p.snapshot),		\
		bkey_format_field(SIZE,		size),			\
		bkey_format_field(VERSION_HI,	version.hi),		\
		bkey_format_field(VERSION_LO,	version.lo),		\
	},								\
})
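
/*
 * Illustrative example (assumed values, not a real on disk format): a btree
 * node that only holds keys for a single inode could use a local bkey_format
 * where bits_per_field[BKEY_FIELD_INODE] is 0 and
 * field_offset[BKEY_FIELD_INODE] stores the common inode number, with the
 * remaining fields sized just large enough for the keys actually present.
 * This is how packing shrinks keys in a node without giving up the wide
 * fields of BKEY_FORMAT_CURRENT.
 */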

/* bkey with inline value */
struct bkey_i {
	__u64			_data[0];

	struct bkey	k;
	struct bch_val	v;
};

#define POS_KEY(_pos)							\
((struct bkey) {							\
	.u64s		= BKEY_U64s,					\
	.format		= KEY_FORMAT_CURRENT,				\
	.p		= _pos,						\
})

#define KEY(_inode, _offset, _size)					\
((struct bkey) {							\
	.u64s		= BKEY_U64s,					\
	.format		= KEY_FORMAT_CURRENT,				\
	.p		= POS(_inode, _offset),				\
	.size		= _size,					\
})

static inline void bkey_init(struct bkey *k)
{
	*k = KEY(0, 0, 0);
}

#define bkey_bytes(_k)		((_k)->u64s * sizeof(__u64))

#define __BKEY_PADDED(key, pad)					\
	struct bkey_i key; __u64 key ## _pad[pad]

/*
 * - DELETED keys are used internally to mark keys that should be ignored but
 *   override keys in composition order.  Their version number is ignored.
 *
 * - DISCARDED keys indicate that the data is all 0s because it has been
 *   discarded. DISCARDs may have a version; if the version is nonzero the key
 *   will be persistent, otherwise the key will be dropped whenever the btree
 *   node is rewritten (like DELETED keys).
 *
 * - ERROR: any read of the data returns a read error, as the data was lost due
 *   to a failing device. Like DISCARDED keys, they can be removed (overridden)
 *   by new writes or cluster-wide GC. Node repair can also overwrite them with
 *   the same or a more recent version number, but not with an older version
 *   number.
 *
 * - WHITEOUT: for hash table btrees
 */
#define BCH_BKEY_TYPES()				\
	x(deleted,		0)			\
	x(whiteout,		1)			\
	x(error,		2)			\
	x(cookie,		3)			\
	x(hash_whiteout,	4)			\
	x(btree_ptr,		5)			\
	x(extent,		6)			\
	x(reservation,		7)			\
	x(inode,		8)			\
	x(inode_generation,	9)			\
	x(dirent,		10)			\
	x(xattr,		11)			\
	x(alloc,		12)			\
	x(quota,		13)			\
	x(stripe,		14)			\
	x(reflink_p,		15)			\
	x(reflink_v,		16)			\
	x(inline_data,		17)			\
	x(btree_ptr_v2,		18)			\
	x(indirect_inline_data,	19)			\
	x(alloc_v2,		20)			\
	x(subvolume,		21)			\
	x(snapshot,		22)			\
	x(inode_v2,		23)			\
	x(alloc_v3,		24)			\
	x(set,			25)			\
	x(lru,			26)			\
	x(alloc_v4,		27)			\
	x(backpointer,		28)			\
	x(inode_v3,		29)			\
	x(bucket_gens,		30)			\
	x(snapshot_tree,	31)			\
	x(logged_op_truncate,	32)			\
	x(logged_op_finsert,	33)

enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name	= nr,
	BCH_BKEY_TYPES()
#undef x
	KEY_TYPE_MAX,
};

struct bch_deleted {
	struct bch_val		v;
};

struct bch_whiteout {
	struct bch_val		v;
};

struct bch_error {
	struct bch_val		v;
};

struct bch_cookie {
	struct bch_val		v;
	__le64			cookie;
};

struct bch_hash_whiteout {
	struct bch_val		v;
};

struct bch_set {
	struct bch_val		v;
};

/* Extents */

/*
 * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
 * preceded by checksum/compression information (bch_extent_crc32 or
 * bch_extent_crc64).
 *
 * One major determining factor in the format of extents is how we handle and
 * represent extents that have been partially overwritten and thus trimmed:
 *
 * If an extent is not checksummed or compressed, when the extent is trimmed we
 * don't have to remember the extent we originally allocated and wrote: we can
 * merely adjust ptr->offset to point to the start of the data that is currently
 * live. The size field in struct bkey records the current (live) size of the
 * extent, and is also used to mean "size of region on disk that we point to" in
 * this case.
 *
 * Thus an extent that is not checksummed or compressed will consist only of a
 * list of bch_extent_ptrs, with none of the fields in
 * bch_extent_crc32/bch_extent_crc64.
 *
 * When an extent is checksummed or compressed, it's not possible to read only
 * the data that is currently live: we have to read the entire extent that was
 * originally written, and then return only the part of the extent that is
 * currently live.
 *
 * Thus, in addition to the current size of the extent in struct bkey, we need
 * to store the size of the originally allocated space - this is the
 * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
 * when the extent is trimmed, instead of modifying the offset field of the
 * pointer, we keep a second smaller offset field - "offset into the original
 * extent of the currently live region".
 *
 * The other major determining factor is replication and data migration:
 *
 * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
 * write, we will initially write all the replicas in the same format, with the
 * same checksum type and compression format - however, when copygc runs later (or
 * tiering/cache promotion, anything that moves data), it is not in general
 * going to rewrite all the pointers at once - one of the replicas may be in a
 * bucket on one device that has very little fragmentation while another lives
 * in a bucket that has become heavily fragmented, and thus is being rewritten
 * sooner than the rest.
 *
 * Thus it will only move a subset of the pointers (or in the case of
 * tiering/cache promotion perhaps add a single pointer without dropping any
 * current pointers), and if the extent has been partially overwritten it must
 * write only the currently live portion (or copygc would not be able to reduce
 * fragmentation!) - which necessitates a different bch_extent_crc format for
 * the new pointer.
 *
 * But in the interests of space efficiency, we don't want to store one
 * bch_extent_crc for each pointer if we don't have to.
 *
 * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
 * bch_extent_ptrs appended arbitrarily one after the other. We determine the
 * type of a given entry with a scheme similar to utf8 (except we're encoding a
 * type, not a size), encoding the type in the position of the first set bit:
 *
 * bch_extent_crc32	- 0b1
 * bch_extent_ptr	- 0b10
 * bch_extent_crc64	- 0b100
 *
 * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
 * bch_extent_crc64 is the least constrained).
 *
 * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
 * until the next bch_extent_crc32/64.
 *
 * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
 * is neither checksummed nor compressed.
 */

/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
	__le64			lo;
	__le64			hi;
} __packed __aligned(8);

#define BCH_EXTENT_ENTRY_TYPES()		\
	x(ptr,			0)		\
	x(crc32,		1)		\
	x(crc64,		2)		\
	x(crc128,		3)		\
	x(stripe_ptr,		4)		\
	x(rebalance,		5)
#define BCH_EXTENT_ENTRY_MAX	6

enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
	BCH_EXTENT_ENTRY_TYPES()
#undef x
};

/* Compressed/uncompressed size are stored biased by 1: */
struct bch_extent_crc32 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u32			type:2,
				_compressed_size:7,
				_uncompressed_size:7,
				offset:7,
				_unused:1,
				csum_type:4,
				compression_type:4;
	__u32			csum;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u32			csum;
	__u32			compression_type:4,
				csum_type:4,
				_unused:1,
				offset:7,
				_uncompressed_size:7,
				_compressed_size:7,
				type:2;
#endif
} __packed __aligned(8);

#define CRC32_SIZE_MAX		(1U << 7)
#define CRC32_NONCE_MAX		0

struct bch_extent_crc64 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64			type:3,
				_compressed_size:9,
				_uncompressed_size:9,
				offset:9,
				nonce:10,
				csum_type:4,
				compression_type:4,
				csum_hi:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			csum_hi:16,
				compression_type:4,
				csum_type:4,
				nonce:10,
				offset:9,
				_uncompressed_size:9,
				_compressed_size:9,
				type:3;
#endif
	__u64			csum_lo;
} __packed __aligned(8);

#define CRC64_SIZE_MAX		(1U << 9)
#define CRC64_NONCE_MAX		((1U << 10) - 1)

struct bch_extent_crc128 {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64			type:4,
				_compressed_size:13,
				_uncompressed_size:13,
				offset:13,
				nonce:13,
				csum_type:4,
				compression_type:4;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			compression_type:4,
				csum_type:4,
				nonce:13,
				offset:13,
				_uncompressed_size:13,
				_compressed_size:13,
				type:4;
#endif
	struct bch_csum		csum;
} __packed __aligned(8);

#define CRC128_SIZE_MAX		(1U << 13)
#define CRC128_NONCE_MAX	((1U << 13) - 1)

/*
 * @reservation - pointer hasn't been written to, just reserved
 */
struct bch_extent_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64			type:1,
				cached:1,
				unused:1,
				unwritten:1,
				offset:44, /* 8 petabytes */
				dev:8,
				gen:8;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			gen:8,
				dev:8,
				offset:44,
				unwritten:1,
				unused:1,
				cached:1,
				type:1;
#endif
} __packed __aligned(8);

struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64			type:5,
				block:8,
				redundancy:4,
				idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			idx:47,
				redundancy:4,
				block:8,
				type:5;
#endif
};

struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u64			type:6,
				unused:34,
				compression:8, /* enum bch_compression_opt */
				target:16;
#elif defined (__BIG_ENDIAN_BITFIELD)
	__u64			target:16,
				compression:8,
				unused:34,
				type:6;
#endif
};

union bch_extent_entry {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ||  __BITS_PER_LONG == 64
	unsigned long			type;
#elif __BITS_PER_LONG == 32
	struct {
		unsigned long		pad;
		unsigned long		type;
	};
#else
#error edit for your odd byteorder.
#endif

#define x(f, n) struct bch_extent_##f	f;
	BCH_EXTENT_ENTRY_TYPES()
#undef x
};
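
/*
 * Sketch of how the utf8-style tagging described above is decoded: the entry
 * type is the index of the lowest set bit of the first word. Shown here only
 * to illustrate the encoding; the real helpers live in extents.h.
 */
static inline unsigned extent_entry_type_sketch(const union bch_extent_entry *e)
{
	return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
}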

struct bch_btree_ptr {
	struct bch_val		v;

	__u64			_data[0];
	struct bch_extent_ptr	start[];
} __packed __aligned(8);

struct bch_btree_ptr_v2 {
	struct bch_val		v;

	__u64			mem_ptr;
	__le64			seq;
	__le16			sectors_written;
	__le16			flags;
	struct bpos		min_key;
	__u64			_data[0];
	struct bch_extent_ptr	start[];
} __packed __aligned(8);

LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,	struct bch_btree_ptr_v2, flags, 0, 1);

struct bch_extent {
	struct bch_val		v;

	__u64			_data[0];
	union bch_extent_entry	start[];
} __packed __aligned(8);

struct bch_reservation {
	struct bch_val		v;

	__le32			generation;
	__u8			nr_replicas;
	__u8			pad[3];
} __packed __aligned(8);

/* Maximum size (in u64s) a single pointer could be: */
#define BKEY_EXTENT_PTR_U64s_MAX\
	((sizeof(struct bch_extent_crc128) +			\
	  sizeof(struct bch_extent_ptr)) / sizeof(__u64))

/* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX				\
	(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))

/* Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX		(BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)

/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX				\
	((sizeof(struct bch_btree_ptr_v2) +			\
	  sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
#define BKEY_BTREE_PTR_U64s_MAX					\
	(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)

/* Inodes */

#define BLOCKDEV_INODE_MAX	4096

#define BCACHEFS_ROOT_INO	4096

struct bch_inode {
	struct bch_val		v;

	__le64			bi_hash_seed;
	__le32			bi_flags;
	__le16			bi_mode;
	__u8			fields[];
} __packed __aligned(8);

struct bch_inode_v2 {
	struct bch_val		v;

	__le64			bi_journal_seq;
	__le64			bi_hash_seed;
	__le64			bi_flags;
	__le16			bi_mode;
	__u8			fields[];
} __packed __aligned(8);

struct bch_inode_v3 {
	struct bch_val		v;

	__le64			bi_journal_seq;
	__le64			bi_hash_seed;
	__le64			bi_flags;
	__le64			bi_sectors;
	__le64			bi_size;
	__le64			bi_version;
	__u8			fields[];
} __packed __aligned(8);

#define INODEv3_FIELDS_START_INITIAL	6
#define INODEv3_FIELDS_START_CUR	(offsetof(struct bch_inode_v3, fields) / sizeof(__u64))

struct bch_inode_generation {
	struct bch_val		v;

	__le32			bi_generation;
	__le32			pad;
} __packed __aligned(8);

/*
 * bi_subvol and bi_parent_subvol are only set for subvolume roots:
 */

#define BCH_INODE_FIELDS_v2()			\
	x(bi_atime,			96)	\
	x(bi_ctime,			96)	\
	x(bi_mtime,			96)	\
	x(bi_otime,			96)	\
	x(bi_size,			64)	\
	x(bi_sectors,			64)	\
	x(bi_uid,			32)	\
	x(bi_gid,			32)	\
	x(bi_nlink,			32)	\
	x(bi_generation,		32)	\
	x(bi_dev,			32)	\
	x(bi_data_checksum,		8)	\
	x(bi_compression,		8)	\
	x(bi_project,			32)	\
	x(bi_background_compression,	8)	\
	x(bi_data_replicas,		8)	\
	x(bi_promote_target,		16)	\
	x(bi_foreground_target,		16)	\
	x(bi_background_target,		16)	\
	x(bi_erasure_code,		16)	\
	x(bi_fields_set,		16)	\
	x(bi_dir,			64)	\
	x(bi_dir_offset,		64)	\
	x(bi_subvol,			32)	\
	x(bi_parent_subvol,		32)

#define BCH_INODE_FIELDS_v3()			\
	x(bi_atime,			96)	\
	x(bi_ctime,			96)	\
	x(bi_mtime,			96)	\
	x(bi_otime,			96)	\
	x(bi_uid,			32)	\
	x(bi_gid,			32)	\
	x(bi_nlink,			32)	\
	x(bi_generation,		32)	\
	x(bi_dev,			32)	\
	x(bi_data_checksum,		8)	\
	x(bi_compression,		8)	\
	x(bi_project,			32)	\
	x(bi_background_compression,	8)	\
	x(bi_data_replicas,		8)	\
	x(bi_promote_target,		16)	\
	x(bi_foreground_target,		16)	\
	x(bi_background_target,		16)	\
	x(bi_erasure_code,		16)	\
	x(bi_fields_set,		16)	\
	x(bi_dir,			64)	\
	x(bi_dir_offset,		64)	\
	x(bi_subvol,			32)	\
	x(bi_parent_subvol,		32)	\
	x(bi_nocow,			8)

/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS()			\
	x(data_checksum,		8)	\
	x(compression,			8)	\
	x(project,			32)	\
	x(background_compression,	8)	\
	x(data_replicas,		8)	\
	x(promote_target,		16)	\
	x(foreground_target,		16)	\
	x(background_target,		16)	\
	x(erasure_code,			16)	\
	x(nocow,			8)

enum inode_opt_id {
#define x(name, ...)				\
	Inode_opt_##name,
	BCH_INODE_OPTS()
#undef x
	Inode_opt_nr,
};

#define BCH_INODE_FLAGS()			\
	x(sync,				0)	\
	x(immutable,			1)	\
	x(append,			2)	\
	x(nodump,			3)	\
	x(noatime,			4)	\
	x(i_size_dirty,			5)	\
	x(i_sectors_dirty,		6)	\
	x(unlinked,			7)	\
	x(backptr_untrusted,		8)

/* bits 20+ reserved for packed fields below: */

enum bch_inode_flags {
#define x(t, n)	BCH_INODE_##t = 1U << n,
	BCH_INODE_FLAGS()
#undef x
};

enum __bch_inode_flags {
#define x(t, n)	__BCH_INODE_##t = n,
	BCH_INODE_FLAGS()
#undef x
};

LE32_BITMASK(INODE_STR_HASH,	struct bch_inode, bi_flags, 20, 24);
LE32_BITMASK(INODE_NR_FIELDS,	struct bch_inode, bi_flags, 24, 31);
LE32_BITMASK(INODE_NEW_VARINT,	struct bch_inode, bi_flags, 31, 32);

LE64_BITMASK(INODEv2_STR_HASH,	struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS,	struct bch_inode_v2, bi_flags, 24, 31);

LE64_BITMASK(INODEv3_STR_HASH,	struct bch_inode_v3, bi_flags, 20, 24);
LE64_BITMASK(INODEv3_NR_FIELDS,	struct bch_inode_v3, bi_flags, 24, 31);

LE64_BITMASK(INODEv3_FIELDS_START,
				struct bch_inode_v3, bi_flags, 31, 36);
LE64_BITMASK(INODEv3_MODE,	struct bch_inode_v3, bi_flags, 36, 52);

/* Dirents */

/*
 * Dirents (and xattrs) have to implement string lookups; since our b-tree
 * doesn't support arbitrary length strings for the key, we instead index by a
 * 64 bit hash (currently truncated sha1) of the string, stored in the offset
 * field of the key - using linear probing to resolve hash collisions. This also
 * provides us with the readdir cookie posix requires.
 *
 * Linear probing requires us to use whiteouts for deletions, in the event of a
 * collision:
 */

struct bch_dirent {
	struct bch_val		v;

	/* Target inode number: */
	union {
		__le64		d_inum;
		struct {	/* DT_SUBVOL */
			__le32	d_child_subvol;
			__le32	d_parent_subvol;
		};
	};

	/*
	 * Copy of mode bits 12-15 from the target inode - so userspace can get
	 * the filetype without having to do a stat()
	 */
	__u8			d_type;

	__u8			d_name[];
} __packed __aligned(8);

#define DT_SUBVOL	16
#define BCH_DT_MAX	17

#define BCH_NAME_MAX	512
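
/*
 * Sketch of the scheme described above, with a hypothetical hash helper for
 * illustration only (the real, per-inode-seeded hash code lives in
 * str_hash.h): a dirent for "name" in directory @dir_inum is stored at
 * approximately
 *
 *	POS(dir_inum, some_hash(name))
 *
 * and lookup probes forward from that position until it finds the matching
 * name or an empty slot; deletions insert whiteouts so that probing doesn't
 * terminate early.
 */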

/* Xattrs */

#define KEY_TYPE_XATTR_INDEX_USER		0
#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS	1
#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT	2
#define KEY_TYPE_XATTR_INDEX_TRUSTED		3
#define KEY_TYPE_XATTR_INDEX_SECURITY		4

struct bch_xattr {
	struct bch_val		v;
	__u8			x_type;
	__u8			x_name_len;
	__le16			x_val_len;
	__u8			x_name[];
} __packed __aligned(8);

/* Bucket/allocation information: */

struct bch_alloc {
	struct bch_val		v;
	__u8			fields;
	__u8			gen;
	__u8			data[];
} __packed __aligned(8);

#define BCH_ALLOC_FIELDS_V1()			\
	x(read_time,		16)		\
	x(write_time,		16)		\
	x(data_type,		8)		\
	x(dirty_sectors,	16)		\
	x(cached_sectors,	16)		\
	x(oldest_gen,		8)		\
	x(stripe,		32)		\
	x(stripe_redundancy,	8)

enum {
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

struct bch_alloc_v2 {
	struct bch_val		v;
	__u8			nr_fields;
	__u8			gen;
	__u8			oldest_gen;
	__u8			data_type;
	__u8			data[];
} __packed __aligned(8);

#define BCH_ALLOC_FIELDS_V2()			\
	x(read_time,		64)		\
	x(write_time,		64)		\
	x(dirty_sectors,	32)		\
	x(cached_sectors,	32)		\
	x(stripe,		32)		\
	x(stripe_redundancy,	8)

struct bch_alloc_v3 {
	struct bch_val		v;
	__le64			journal_seq;
	__le32			flags;
	__u8			nr_fields;
	__u8			gen;
	__u8			oldest_gen;
	__u8			data_type;
	__u8			data[];
} __packed __aligned(8);

LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)

struct bch_alloc_v4 {
	struct bch_val		v;
	__u64			journal_seq;
	__u32			flags;
	__u8			gen;
	__u8			oldest_gen;
	__u8			data_type;
	__u8			stripe_redundancy;
	__u32			dirty_sectors;
	__u32			cached_sectors;
	__u64			io_time[2];
	__u32			stripe;
	__u32			nr_external_backpointers;
	__u64			fragmentation_lru;
} __packed __aligned(8);

#define BCH_ALLOC_V4_U64s_V0	6
#define BCH_ALLOC_V4_U64s	(sizeof(struct bch_alloc_v4) / sizeof(__u64))

BITMASK(BCH_ALLOC_V4_NEED_DISCARD,	struct bch_alloc_v4, flags,  0,  1)
BITMASK(BCH_ALLOC_V4_NEED_INC_GEN,	struct bch_alloc_v4, flags,  1,  2)
BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags,  2,  8)
BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS,	struct bch_alloc_v4, flags,  8, 14)

#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX	40

struct bch_backpointer {
	struct bch_val		v;
	__u8			btree_id;
	__u8			level;
	__u8			data_type;
	__u64			bucket_offset:40;
	__u32			bucket_len;
	struct bpos		pos;
} __packed __aligned(8);

#define KEY_TYPE_BUCKET_GENS_BITS	8
#define KEY_TYPE_BUCKET_GENS_NR		(1U << KEY_TYPE_BUCKET_GENS_BITS)
#define KEY_TYPE_BUCKET_GENS_MASK	(KEY_TYPE_BUCKET_GENS_NR - 1)

struct bch_bucket_gens {
	struct bch_val		v;
	u8			gens[KEY_TYPE_BUCKET_GENS_NR];
} __packed __aligned(8);

/* Quotas: */

enum quota_types {
	QTYP_USR		= 0,
	QTYP_GRP		= 1,
	QTYP_PRJ		= 2,
	QTYP_NR			= 3,
};

enum quota_counters {
	Q_SPC			= 0,
	Q_INO			= 1,
	Q_COUNTERS		= 2,
};

struct bch_quota_counter {
	__le64			hardlimit;
	__le64			softlimit;
};

struct bch_quota {
	struct bch_val		v;
	struct bch_quota_counter c[Q_COUNTERS];
} __packed __aligned(8);

/* Erasure coding */

struct bch_stripe {
	struct bch_val		v;
	__le16			sectors;
	__u8			algorithm;
	__u8			nr_blocks;
	__u8			nr_redundant;

	__u8			csum_granularity_bits;
	__u8			csum_type;
	__u8			pad;

	struct bch_extent_ptr	ptrs[];
} __packed __aligned(8);

/* Reflink: */

struct bch_reflink_p {
	struct bch_val		v;
	__le64			idx;
	/*
	 * A reflink pointer might point to an indirect extent which is then
	 * later split (by copygc or rebalance). If we only pointed to part of
	 * the original indirect extent, and then one of the fragments is
	 * outside the range we point to, we'd leak a refcount: so when creating
	 * reflink pointers, we need to store pad values to remember the full
	 * range we were taking a reference on.
	 */
	__le32			front_pad;
	__le32			back_pad;
} __packed __aligned(8);

struct bch_reflink_v {
	struct bch_val		v;
	__le64			refcount;
	union bch_extent_entry	start[0];
	__u64			_data[];
} __packed __aligned(8);

struct bch_indirect_inline_data {
	struct bch_val		v;
	__le64			refcount;
	u8			data[];
};

/* Inline data */

struct bch_inline_data {
	struct bch_val		v;
	u8			data[];
};

/* Subvolumes: */

#define SUBVOL_POS_MIN		POS(0, 1)
#define SUBVOL_POS_MAX		POS(0, S32_MAX)
#define BCACHEFS_ROOT_SUBVOL	1

struct bch_subvolume {
	struct bch_val		v;
	__le32			flags;
	__le32			snapshot;
	__le64			inode;
	/*
	 * Snapshot subvolumes form a tree, separate from the snapshot nodes
	 * tree - if this subvolume is a snapshot, this is the ID of the
	 * subvolume it was created from:
	 *
	 * This is _not_ necessarily the subvolume of the directory containing
	 * this subvolume:
	 */
	__le32			parent;
	__le32			pad;
	bch_le128		otime;
};

LE32_BITMASK(BCH_SUBVOLUME_RO,		struct bch_subvolume, flags,  0,  1)
/*
 * We need to know whether a subvolume is a snapshot so we can know whether we
 * can delete it (or whether it should just be rm -rf'd)
 */
LE32_BITMASK(BCH_SUBVOLUME_SNAP,	struct bch_subvolume, flags,  1,  2)
LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,	struct bch_subvolume, flags,  2,  3)

/* Snapshots */

struct bch_snapshot {
	struct bch_val		v;
	__le32			flags;
	__le32			parent;
	__le32			children[2];
	__le32			subvol;
	/* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
	__le32			tree;
	__le32			depth;
	__le32			skip[3];
	bch_le128		btime;
};

LE32_BITMASK(BCH_SNAPSHOT_DELETED,	struct bch_snapshot, flags,  0,  1)

/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,	struct bch_snapshot, flags,  1,  2)

/*
 * Snapshot trees:
 *
 * The snapshot_trees btree gives us a persistent identifier for each tree of
 * bch_snapshot nodes, and allows us to record and easily find the root/master
 * subvolume that other snapshots were created from:
 */
struct bch_snapshot_tree {
	struct bch_val		v;
	__le32			master_subvol;
	__le32			root_snapshot;
};

/* LRU btree: */

struct bch_lru {
	struct bch_val		v;
	__le64			idx;
} __packed __aligned(8);

#define LRU_ID_STRIPES		(1U << 16)

/* Logged operations btree: */

struct bch_logged_op_truncate {
	struct bch_val		v;
	__le32			subvol;
	__le32			pad;
	__le64			inum;
	__le64			new_i_size;
};

enum logged_op_finsert_state {
	LOGGED_OP_FINSERT_start,
	LOGGED_OP_FINSERT_shift_extents,
	LOGGED_OP_FINSERT_finish,
};

struct bch_logged_op_finsert {
	struct bch_val		v;
	__u8			state;
	__u8			pad[3];
	__le32			subvol;
	__le64			inum;
	__le64			dst_offset;
	__le64			src_offset;
	__le64			pos;
};

/* Optional/variable size superblock sections: */

struct bch_sb_field {
	__u64			_data[0];
	__le32			u64s;
	__le32			type;
};

#define BCH_SB_FIELDS()				\
	x(journal,			0)	\
	x(members_v1,			1)	\
	x(crypt,			2)	\
	x(replicas_v0,			3)	\
	x(quota,			4)	\
	x(disk_groups,			5)	\
	x(clean,			6)	\
	x(replicas,			7)	\
	x(journal_seq_blacklist,	8)	\
	x(journal_v2,			9)	\
	x(counters,			10)	\
	x(members_v2,			11)	\
	x(errors,			12)	\
	x(ext,				13)	\
	x(downgrade,			14)

enum bch_sb_field_type {
#define x(f, nr)	BCH_SB_FIELD_##f = nr,
	BCH_SB_FIELDS()
#undef x
	BCH_SB_FIELD_NR
};

/*
 * Most superblock fields are replicated in all device's superblocks - a few are
 * not:
 */
#define BCH_SINGLE_DEVICE_SB_FIELDS		\
	((1U << BCH_SB_FIELD_journal)|		\
	 (1U << BCH_SB_FIELD_journal_v2))

/* BCH_SB_FIELD_journal: */

struct bch_sb_field_journal {
	struct bch_sb_field	field;
	__le64			buckets[];
};

struct bch_sb_field_journal_v2 {
	struct bch_sb_field	field;

	struct bch_sb_field_journal_v2_entry {
		__le64		start;
		__le64		nr;
	}			d[];
};

/* BCH_SB_FIELD_members_v1: */

#define BCH_MIN_NR_NBUCKETS	(1 << 6)

#define BCH_IOPS_MEASUREMENTS()			\
	x(seqread,	0)			\
	x(seqwrite,	1)			\
	x(randread,	2)			\
	x(randwrite,	3)

enum bch_iops_measurement {
#define x(t, n) BCH_IOPS_##t = n,
	BCH_IOPS_MEASUREMENTS()
#undef x
	BCH_IOPS_NR
};

#define BCH_MEMBER_ERROR_TYPES()		\
	x(read,		0)			\
	x(write,	1)			\
	x(checksum,	2)

enum bch_member_error_type {
#define x(t, n) BCH_MEMBER_ERROR_##t = n,
	BCH_MEMBER_ERROR_TYPES()
#undef x
	BCH_MEMBER_ERROR_NR
};

struct bch_member {
	__uuid_t		uuid;
	__le64			nbuckets;	/* device size */
	__le16			first_bucket;	/* index of first bucket used */
	__le16			bucket_size;	/* sectors */
	__le32			pad;
	__le64			last_mount;	/* time_t */

	__le64			flags;
	__le32			iops[4];
	__le64			errors[BCH_MEMBER_ERROR_NR];
	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
	__le64			errors_reset_time;
	__le64			seq;
};

#define BCH_MEMBER_V1_BYTES	56

LE64_BITMASK(BCH_MEMBER_STATE,		struct bch_member, flags,  0,  4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD,	struct bch_member, flags, 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,	struct bch_member, flags, 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP,		struct bch_member, flags, 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY,	struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
					struct bch_member, flags, 30, 31)

#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,	struct bch_member, flags[1], 0,  20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#endif

#define BCH_MEMBER_STATES()			\
	x(rw,		0)			\
	x(ro,		1)			\
	x(failed,	2)			\
	x(spare,	3)

enum bch_member_state {
#define x(t, n) BCH_MEMBER_STATE_##t = n,
	BCH_MEMBER_STATES()
#undef x
	BCH_MEMBER_STATE_NR
};

struct bch_sb_field_members_v1 {
	struct bch_sb_field	field;
	struct bch_member	_members[]; //Members are now variable size
};

struct bch_sb_field_members_v2 {
	struct bch_sb_field	field;
	__le16			member_bytes; //size of single member entry
	u8			pad[6];
	struct bch_member	_members[];
};

/* BCH_SB_FIELD_crypt: */

struct nonce {
	__le32			d[4];
};

struct bch_key {
	__le64			key[4];
};

#define BCH_KEY_MAGIC					\
	(((__u64) 'b' <<  0)|((__u64) 'c' <<  8)|	\
	 ((__u64) 'h' << 16)|((__u64) '*' << 24)|	\
	 ((__u64) '*' << 32)|((__u64) 'k' << 40)|	\
	 ((__u64) 'e' << 48)|((__u64) 'y' << 56))

struct bch_encrypted_key {
	__le64			magic;
	struct bch_key		key;
};

/*
 * If this field is present in the superblock, it stores an encryption key which
 * is used to encrypt all other data/metadata. The key will normally be encrypted
 * with the key userspace provides, but if encryption has been turned off we'll
 * just store the master key unencrypted in the superblock so we can access the
 * previously encrypted data.
 */
struct bch_sb_field_crypt {
	struct bch_sb_field	field;

	__le64			flags;
	__le64			kdf_flags;
	struct bch_encrypted_key key;
};

LE64_BITMASK(BCH_CRYPT_KDF_TYPE,	struct bch_sb_field_crypt, flags, 0, 4);

enum bch_kdf_types {
	BCH_KDF_SCRYPT		= 0,
	BCH_KDF_NR		= 1,
};

/* stored as base 2 log of scrypt params: */
LE64_BITMASK(BCH_KDF_SCRYPT_N,	struct bch_sb_field_crypt, kdf_flags,  0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R,	struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);

/* BCH_SB_FIELD_replicas: */

#define BCH_DATA_TYPES()		\
	x(free,		0)		\
	x(sb,		1)		\
	x(journal,	2)		\
	x(btree,	3)		\
	x(user,		4)		\
	x(cached,	5)		\
	x(parity,	6)		\
	x(stripe,	7)		\
	x(need_gc_gens,	8)		\
	x(need_discard,	9)

enum bch_data_type {
#define x(t, n) BCH_DATA_##t,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};

static inline bool data_type_is_empty(enum bch_data_type type)
{
	switch (type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
	case BCH_DATA_need_discard:
		return true;
	default:
		return false;
	}
}

static inline bool data_type_is_hidden(enum bch_data_type type)
{
	switch (type) {
	case BCH_DATA_sb:
	case BCH_DATA_journal:
		return true;
	default:
		return false;
	}
}

struct bch_replicas_entry_v0 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			devs[];
} __packed;

struct bch_sb_field_replicas_v0 {
	struct bch_sb_field	field;
	struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);

struct bch_replicas_entry_v1 {
	__u8			data_type;
	__u8			nr_devs;
	__u8			nr_required;
	__u8			devs[];
} __packed;

#define replicas_entry_bytes(_i)					\
	(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
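
/*
 * e.g. a bch_replicas_entry_v1 with nr_devs = 2 occupies
 * offsetof(devs) + nr_devs = 3 + 2 = 5 bytes, since the devs[] array is laid
 * out inline after the three header bytes.
 */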

struct bch_sb_field_replicas {
	struct bch_sb_field	field;
	struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);

/* BCH_SB_FIELD_quota: */

struct bch_sb_quota_counter {
	__le32				timelimit;
	__le32				warnlimit;
};

struct bch_sb_quota_type {
	__le64				flags;
	struct bch_sb_quota_counter	c[Q_COUNTERS];
};

struct bch_sb_field_quota {
	struct bch_sb_field		field;
	struct bch_sb_quota_type	q[QTYP_NR];
} __packed __aligned(8);

/* BCH_SB_FIELD_disk_groups: */

#define BCH_SB_LABEL_SIZE		32

struct bch_disk_group {
	__u8			label[BCH_SB_LABEL_SIZE];
	__le64			flags[2];
} __packed __aligned(8);

LE64_BITMASK(BCH_GROUP_DELETED,		struct bch_disk_group, flags[0], 0,  1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,	struct bch_disk_group, flags[0], 1,  6)
LE64_BITMASK(BCH_GROUP_PARENT,		struct bch_disk_group, flags[0], 6, 24)

struct bch_sb_field_disk_groups {
	struct bch_sb_field	field;
	struct bch_disk_group	entries[];
} __packed __aligned(8);

/* BCH_SB_FIELD_counters */

#define BCH_PERSISTENT_COUNTERS()				\
	x(io_read,					0)	\
	x(io_write,					1)	\
	x(io_move,					2)	\
	x(bucket_invalidate,				3)	\
	x(bucket_discard,				4)	\
	x(bucket_alloc,					5)	\
	x(bucket_alloc_fail,				6)	\
	x(btree_cache_scan,				7)	\
	x(btree_cache_reap,				8)	\
	x(btree_cache_cannibalize,			9)	\
	x(btree_cache_cannibalize_lock,			10)	\
	x(btree_cache_cannibalize_lock_fail,		11)	\
	x(btree_cache_cannibalize_unlock,		12)	\
	x(btree_node_write,				13)	\
	x(btree_node_read,				14)	\
	x(btree_node_compact,				15)	\
	x(btree_node_merge,				16)	\
	x(btree_node_split,				17)	\
	x(btree_node_rewrite,				18)	\
	x(btree_node_alloc,				19)	\
	x(btree_node_free,				20)	\
	x(btree_node_set_root,				21)	\
	x(btree_path_relock_fail,			22)	\
	x(btree_path_upgrade_fail,			23)	\
	x(btree_reserve_get_fail,			24)	\
	x(journal_entry_full,				25)	\
	x(journal_full,					26)	\
	x(journal_reclaim_finish,			27)	\
	x(journal_reclaim_start,			28)	\
	x(journal_write,				29)	\
	x(read_promote,					30)	\
	x(read_bounce,					31)	\
	x(read_split,					33)	\
	x(read_retry,					32)	\
	x(read_reuse_race,				34)	\
	x(move_extent_read,				35)	\
	x(move_extent_write,				36)	\
	x(move_extent_finish,				37)	\
	x(move_extent_fail,				38)	\
	x(move_extent_start_fail,			39)	\
	x(copygc,					40)	\
	x(copygc_wait,					41)	\
	x(gc_gens_end,					42)	\
	x(gc_gens_start,				43)	\
	x(trans_blocked_journal_reclaim,		44)	\
	x(trans_restart_btree_node_reused,		45)	\
	x(trans_restart_btree_node_split,		46)	\
	x(trans_restart_fault_inject,			47)	\
	x(trans_restart_iter_upgrade,			48)	\
	x(trans_restart_journal_preres_get,		49)	\
	x(trans_restart_journal_reclaim,		50)	\
	x(trans_restart_journal_res_get,		51)	\
	x(trans_restart_key_cache_key_realloced,	52)	\
	x(trans_restart_key_cache_raced,		53)	\
	x(trans_restart_mark_replicas,			54)	\
	x(trans_restart_mem_realloced,			55)	\
	x(trans_restart_memory_allocation_failure,	56)	\
	x(trans_restart_relock,				57)	\
	x(trans_restart_relock_after_fill,		58)	\
	x(trans_restart_relock_key_cache_fill,		59)	\
	x(trans_restart_relock_next_node,		60)	\
	x(trans_restart_relock_parent_for_fill,		61)	\
	x(trans_restart_relock_path,			62)	\
	x(trans_restart_relock_path_intent,		63)	\
	x(trans_restart_too_many_iters,			64)	\
	x(trans_restart_traverse,			65)	\
	x(trans_restart_upgrade,			66)	\
	x(trans_restart_would_deadlock,			67)	\
	x(trans_restart_would_deadlock_write,		68)	\
	x(trans_restart_injected,			69)	\
	x(trans_restart_key_cache_upgrade,		70)	\
	x(trans_traverse_all,				71)	\
	x(transaction_commit,				72)	\
	x(write_super,					73)	\
	x(trans_restart_would_deadlock_recursion_limit,	74)	\
	x(trans_restart_write_buffer_flush,		75)	\
	x(trans_restart_split_race,			76)	\
	x(write_buffer_flush_slowpath,			77)	\
	x(write_buffer_flush_sync,			78)

enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
	BCH_PERSISTENT_COUNTERS()
#undef x
	BCH_COUNTER_NR
};

struct bch_sb_field_counters {
	struct bch_sb_field	field;
	__le64			d[];
};

/*
 * On clean shutdown, store btree roots and current journal sequence number in
 * the superblock:
 */
struct jset_entry {
	__le16			u64s;
	__u8			btree_id;
	__u8			level;
	__u8			type; /* designates what this jset holds */
	__u8			pad[3];

	struct bkey_i		start[0];
	__u64			_data[];
};

struct bch_sb_field_clean {
	struct bch_sb_field	field;

	__le32			flags;
	__le16			_read_clock; /* no longer used */
	__le16			_write_clock;
	__le64			journal_seq;

	struct jset_entry	start[0];
	__u64			_data[];
};

struct journal_seq_blacklist_entry {
	__le64			start;
	__le64			end;
};

struct bch_sb_field_journal_seq_blacklist {
	struct bch_sb_field	field;
	struct journal_seq_blacklist_entry start[];
};

struct bch_sb_field_errors {
	struct bch_sb_field	field;
	struct bch_sb_field_error_entry {
		__le64		v;
		__le64		last_error_time;
	}			entries[];
};

LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID,	struct bch_sb_field_error_entry, v,  0, 16);
LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR,	struct bch_sb_field_error_entry, v, 16, 64);

struct bch_sb_field_ext {
	struct bch_sb_field	field;
	__le64			recovery_passes_required[2];
	__le64			errors_silent[8];
};

struct bch_sb_field_downgrade_entry {
	__le16			version;
	__le64			recovery_passes[2];
	__le16			nr_errors;
	__le16			errors[] __counted_by(nr_errors);
} __packed __aligned(2);

struct bch_sb_field_downgrade {
	struct bch_sb_field	field;
	struct bch_sb_field_downgrade_entry entries[];
};

/* Superblock: */

/*
 * New versioning scheme:
 * One common version number for all on disk data structures - superblock, btree
 * nodes, journal entries
 */
#define BCH_VERSION_MAJOR(_v)		((__u16) ((_v) >> 10))
#define BCH_VERSION_MINOR(_v)		((__u16) ((_v) & ~(~0U << 10)))
#define BCH_VERSION(_major, _minor)	(((_major) << 10)|(_minor) << 0)
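
/*
 * e.g. BCH_VERSION(1, 4) = (1 << 10) | 4 = 1028, from which
 * BCH_VERSION_MAJOR() recovers 1 and BCH_VERSION_MINOR() recovers 4.
 */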

/*
 * field 1:		version name
 * field 2:		BCH_VERSION(major, minor)
 * field 3:		recovery passes required on upgrade
 */
#define BCH_METADATA_VERSIONS()						\
	x(bkey_renumber,		BCH_VERSION(0, 10))		\
	x(inode_btree_change,		BCH_VERSION(0, 11))		\
	x(snapshot,			BCH_VERSION(0, 12))		\
	x(inode_backpointers,		BCH_VERSION(0, 13))		\
	x(btree_ptr_sectors_written,	BCH_VERSION(0, 14))		\
	x(snapshot_2,			BCH_VERSION(0, 15))		\
	x(reflink_p_fix,		BCH_VERSION(0, 16))		\
	x(subvol_dirent,		BCH_VERSION(0, 17))		\
	x(inode_v2,			BCH_VERSION(0, 18))		\
	x(freespace,			BCH_VERSION(0, 19))		\
	x(alloc_v4,			BCH_VERSION(0, 20))		\
	x(new_data_types,		BCH_VERSION(0, 21))		\
	x(backpointers,			BCH_VERSION(0, 22))		\
	x(inode_v3,			BCH_VERSION(0, 23))		\
	x(unwritten_extents,		BCH_VERSION(0, 24))		\
	x(bucket_gens,			BCH_VERSION(0, 25))		\
	x(lru_v2,			BCH_VERSION(0, 26))		\
	x(fragmentation_lru,		BCH_VERSION(0, 27))		\
	x(no_bps_in_alloc_keys,		BCH_VERSION(0, 28))		\
	x(snapshot_trees,		BCH_VERSION(0, 29))		\
	x(major_minor,			BCH_VERSION(1,  0))		\
	x(snapshot_skiplists,		BCH_VERSION(1,  1))		\
	x(deleted_inodes,		BCH_VERSION(1,  2))		\
	x(rebalance_work,		BCH_VERSION(1,  3))		\
	x(member_seq,			BCH_VERSION(1,  4))

enum bcachefs_metadata_version {
	bcachefs_metadata_version_min = 9,
#define x(t, n)	bcachefs_metadata_version_##t = n,
	BCH_METADATA_VERSIONS()
#undef x
	bcachefs_metadata_version_max
};

static const __maybe_unused
unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;

#define bcachefs_metadata_version_current	(bcachefs_metadata_version_max - 1)

#define BCH_SB_SECTOR			8
#define BCH_SB_MEMBERS_MAX		64 /* XXX kill */

struct bch_sb_layout {
	__uuid_t		magic;	/* bcachefs superblock UUID */
	__u8			layout_type;
	__u8			sb_max_size_bits; /* base 2 of 512 byte sectors */
	__u8			nr_superblocks;
	__u8			pad[5];
	__le64			sb_offset[61];
} __packed __aligned(8);

#define BCH_SB_LAYOUT_SECTOR	7

/*
 * @offset	- sector where this sb was written
 * @version	- on disk format version
 * @version_min	- Oldest metadata version this filesystem contains; so we can
 *		  safely drop compatibility code and refuse to mount filesystems
 *		  we'd need it for
 * @magic	- identifies as a bcachefs superblock (BCHFS_MAGIC)
 * @seq		- identifies most recent superblock, incremented each time
 *		  superblock is written
 * @uuid	- used for generating various magic numbers and identifying
 *		  member devices, never changes
 * @user_uuid	- user visible UUID, may be changed
 * @label	- filesystem label
 * @features	- enabled incompatible features
 */
struct bch_sb {
	struct bch_csum		csum;
	__le16			version;
	__le16			version_min;
	__le16			pad[2];
	__uuid_t		magic;
	__uuid_t		uuid;
	__uuid_t		user_uuid;
	__u8			label[BCH_SB_LABEL_SIZE];
	__le64			offset;
	__le64			seq;

	__le16			block_size;
	__u8			dev_idx;
	__u8			nr_devices;
	__le32			u64s;

	__le64			time_base_lo;
	__le32			time_base_hi;
	__le32			time_precision;

	__le64			flags[7];
	__le64			write_time;
	__le64			features[2];
	__le64			compat[2];

	struct bch_sb_layout	layout;

	struct bch_sb_field	start[0];
	__le64			_data[];
} __packed __aligned(8);

/*
 * Flags:
 * BCH_SB_INITIALIZED	- set on first mount
 * BCH_SB_CLEAN		- did we shut down cleanly? Just a hint, doesn't affect
 *			  behaviour of mount/recovery path:
 * BCH_SB_INODE_32BIT	- limit inode numbers to 32 bits
 * BCH_SB_128_BIT_MACS	- 128 bit macs instead of 80
 * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
 *			  DATA/META_CSUM_TYPE. Also indicates encryption
 *			  algorithm in use, if/when we get more than one
 */

LE16_BITMASK(BCH_SB_BLOCK_SIZE,		struct bch_sb, block_size,  0, 16);

LE64_BITMASK(BCH_SB_INITIALIZED,	struct bch_sb, flags[0],  0,  1);
LE64_BITMASK(BCH_SB_CLEAN,		struct bch_sb, flags[0],  1,  2);
LE64_BITMASK(BCH_SB_CSUM_TYPE,		struct bch_sb, flags[0],  2,  8);
LE64_BITMASK(BCH_SB_ERROR_ACTION,	struct bch_sb, flags[0],  8, 12);

LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE,	struct bch_sb, flags[0], 12, 28);

LE64_BITMASK(BCH_SB_GC_RESERVE,		struct bch_sb, flags[0], 28, 33);
LE64_BITMASK(BCH_SB_ROOT_RESERVE,	struct bch_sb, flags[0], 33, 40);

LE64_BITMASK(BCH_SB_META_CSUM_TYPE,	struct bch_sb, flags[0], 40, 44);
LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE,	struct bch_sb, flags[0], 44, 48);

LE64_BITMASK(BCH_SB_META_REPLICAS_WANT,	struct bch_sb, flags[0], 48, 52);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT,	struct bch_sb, flags[0], 52, 56);

LE64_BITMASK(BCH_SB_POSIX_ACL,		struct bch_sb, flags[0], 56, 57);
LE64_BITMASK(BCH_SB_USRQUOTA,		struct bch_sb, flags[0], 57, 58);
LE64_BITMASK(BCH_SB_GRPQUOTA,		struct bch_sb, flags[0], 58, 59);
LE64_BITMASK(BCH_SB_PRJQUOTA,		struct bch_sb, flags[0], 59, 60);

LE64_BITMASK(BCH_SB_HAS_ERRORS,		struct bch_sb, flags[0], 60, 61);
LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);

LE64_BITMASK(BCH_SB_BIG_ENDIAN,		struct bch_sb, flags[0], 62, 63);

LE64_BITMASK(BCH_SB_STR_HASH_TYPE,	struct bch_sb, flags[1],  0,  4);
LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1],  4,  8);
LE64_BITMASK(BCH_SB_INODE_32BIT,	struct bch_sb, flags[1],  8,  9);

LE64_BITMASK(BCH_SB_128_BIT_MACS,	struct bch_sb, flags[1],  9, 10);
LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE,	struct bch_sb, flags[1], 10, 14);

/*
 * Max size of an extent that may require bouncing to read or write
 * (checksummed, compressed): 64k
 */
LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
					struct bch_sb, flags[1], 14, 20);

LE64_BITMASK(BCH_SB_META_REPLICAS_REQ,	struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);

LE64_BITMASK(BCH_SB_PROMOTE_TARGET,	struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET,	struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET,	struct bch_sb, flags[1], 52, 64);

LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO,
					struct bch_sb, flags[2],  0,  4);
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,	struct bch_sb, flags[2],  4, 64);

LE64_BITMASK(BCH_SB_ERASURE_CODE,	struct bch_sb, flags[3],  0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET,	struct bch_sb, flags[3], 16, 28);
LE64_BITMASK(BCH_SB_SHARD_INUMS,	struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW,		struct bch_sb, flags[4], 33, 34);
LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE,	struct bch_sb, flags[4], 34, 54);
LE64_BITMASK(BCH_SB_VERSION_UPGRADE,	struct bch_sb, flags[4], 54, 56);

LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI,
					struct bch_sb, flags[4], 60, 64);

LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE,
					struct bch_sb, flags[5],  0, 16);

static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
	return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4);
}

static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{
	SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v);
	SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4);
}

static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb)
{
	return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) |
		(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4);
}

static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v)
{
	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v);
	SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4);
}
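
/*
 * Illustrative sketch: the 8-bit compression fields are split 4/4 across
 * flags[1] and flags[4] - presumably because the low nibble was allocated
 * before the field needed to grow - and the helpers above recombine them:
 */
#if 0
static void example_compression_roundtrip(struct bch_sb *sb)
{
	/* low nibble lands in flags[1] bits 4-8, high nibble in flags[4] bits 56-60 */
	SET_BCH_SB_COMPRESSION_TYPE(sb, 0x2a);
	/* BCH_SB_COMPRESSION_TYPE(sb) now reads back 0x2a */
}
#endif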

/*
 * Features:
 *
 * journal_seq_blacklist_v3:	gates BCH_SB_FIELD_journal_seq_blacklist
 * reflink:			gates KEY_TYPE_reflink
 * inline_data:			gates KEY_TYPE_inline_data
 * new_siphash:			gates BCH_STR_HASH_siphash
 * new_extent_overwrite:	gates BTREE_NODE_NEW_EXTENT_OVERWRITE
 */
#define BCH_SB_FEATURES()			\
	x(lz4,				0)	\
	x(gzip,				1)	\
	x(zstd,				2)	\
	x(atomic_nlink,			3)	\
	x(ec,				4)	\
	x(journal_seq_blacklist_v3,	5)	\
	x(reflink,			6)	\
	x(new_siphash,			7)	\
	x(inline_data,			8)	\
	x(new_extent_overwrite,		9)	\
	x(incompressible,		10)	\
	x(btree_ptr_v2,			11)	\
	x(extents_above_btree_updates,	12)	\
	x(btree_updates_journalled,	13)	\
	x(reflink_inline_data,		14)	\
	x(new_varint,			15)	\
	x(journal_no_flush,		16)	\
	x(alloc_v2,			17)	\
	x(extents_across_btree_nodes,	18)

#define BCH_SB_FEATURES_ALWAYS				\
	((1ULL << BCH_FEATURE_new_extent_overwrite)|	\
	 (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
	 (1ULL << BCH_FEATURE_btree_updates_journalled)|\
	 (1ULL << BCH_FEATURE_alloc_v2)|		\
	 (1ULL << BCH_FEATURE_extents_across_btree_nodes))

#define BCH_SB_FEATURES_ALL				\
	(BCH_SB_FEATURES_ALWAYS|			\
	 (1ULL << BCH_FEATURE_new_siphash)|		\
	 (1ULL << BCH_FEATURE_btree_ptr_v2)|		\
	 (1ULL << BCH_FEATURE_new_varint)|		\
	 (1ULL << BCH_FEATURE_journal_no_flush))

enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
	BCH_SB_FEATURES()
#undef x
	BCH_FEATURE_NR,
};
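
/*
 * Illustrative sketch (hypothetical helper, not in this file): the x-macro
 * above expands to BCH_FEATURE_lz4 (= 0), BCH_FEATURE_gzip (= 1), and so on.
 * All currently defined features fit in features[0]; the features[f / 64]
 * indexing for bits 64+ spilling into features[1] is an assumption:
 */
#if 0
static bool example_sb_has_feature(struct bch_sb *sb, enum bch_sb_feature f)
{
	return __le64_to_cpu(sb->features[f / 64]) & (1ULL << (f % 64));
}
#endif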

#define BCH_SB_COMPAT()					\
	x(alloc_info,				0)	\
	x(alloc_metadata,			1)	\
	x(extents_above_btree_updates_done,	2)	\
	x(bformat_overflow_done,		3)

enum bch_sb_compat {
#define x(f, n) BCH_COMPAT_##f,
	BCH_SB_COMPAT()
#undef x
	BCH_COMPAT_NR,
};

/* options: */

#define BCH_VERSION_UPGRADE_OPTS()	\
	x(compatible,		0)	\
	x(incompatible,		1)	\
	x(none,			2)

enum bch_version_upgrade_opts {
#define x(t, n) BCH_VERSION_UPGRADE_##t = n,
	BCH_VERSION_UPGRADE_OPTS()
#undef x
};

#define BCH_REPLICAS_MAX		4U

#define BCH_BKEY_PTRS_MAX		16U

#define BCH_ERROR_ACTIONS()		\
	x(continue,		0)	\
	x(ro,			1)	\
	x(panic,		2)

enum bch_error_actions {
#define x(t, n) BCH_ON_ERROR_##t = n,
	BCH_ERROR_ACTIONS()
#undef x
	BCH_ON_ERROR_NR
};

#define BCH_STR_HASH_TYPES()		\
	x(crc32c,		0)	\
	x(crc64,		1)	\
	x(siphash_old,		2)	\
	x(siphash,		3)

enum bch_str_hash_type {
#define x(t, n) BCH_STR_HASH_##t = n,
	BCH_STR_HASH_TYPES()
#undef x
	BCH_STR_HASH_NR
};

#define BCH_STR_HASH_OPTS()		\
	x(crc32c,		0)	\
	x(crc64,		1)	\
	x(siphash,		2)

enum bch_str_hash_opts {
#define x(t, n) BCH_STR_HASH_OPT_##t = n,
	BCH_STR_HASH_OPTS()
#undef x
	BCH_STR_HASH_OPT_NR
};

#define BCH_CSUM_TYPES()			\
	x(none,				0)	\
	x(crc32c_nonzero,		1)	\
	x(crc64_nonzero,		2)	\
	x(chacha20_poly1305_80,		3)	\
	x(chacha20_poly1305_128,	4)	\
	x(crc32c,			5)	\
	x(crc64,			6)	\
	x(xxhash,			7)

enum bch_csum_type {
#define x(t, n) BCH_CSUM_##t = n,
	BCH_CSUM_TYPES()
#undef x
	BCH_CSUM_NR
};

static const __maybe_unused unsigned bch_crc_bytes[] = {
	[BCH_CSUM_none]				= 0,
	[BCH_CSUM_crc32c_nonzero]		= 4,
	[BCH_CSUM_crc32c]			= 4,
	[BCH_CSUM_crc64_nonzero]		= 8,
	[BCH_CSUM_crc64]			= 8,
	[BCH_CSUM_xxhash]			= 8,
	[BCH_CSUM_chacha20_poly1305_80]		= 10,
	[BCH_CSUM_chacha20_poly1305_128]	= 16,
};

static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
{
	switch (type) {
	case BCH_CSUM_chacha20_poly1305_80:
	case BCH_CSUM_chacha20_poly1305_128:
		return true;
	default:
		return false;
	}
}
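
/*
 * Illustrative sketch (hypothetical helper, not in this file): per the flags
 * comment above, a nonzero BCH_SB_ENCRYPTION_TYPE overrides the configured
 * checksum types - the AEAD tag doubles as the checksum, with
 * BCH_SB_128_BIT_MACS selecting the wider tag. The selection logic below is
 * an assumption:
 */
#if 0
static enum bch_csum_type example_meta_csum_type(struct bch_sb *sb)
{
	if (BCH_SB_ENCRYPTION_TYPE(sb))
		return BCH_SB_128_BIT_MACS(sb)
			? BCH_CSUM_chacha20_poly1305_128
			: BCH_CSUM_chacha20_poly1305_80;
	return BCH_SB_META_CSUM_TYPE(sb);
}
#endif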

#define BCH_CSUM_OPTS()			\
	x(none,			0)	\
	x(crc32c,		1)	\
	x(crc64,		2)	\
	x(xxhash,		3)

enum bch_csum_opts {
#define x(t, n) BCH_CSUM_OPT_##t = n,
	BCH_CSUM_OPTS()
#undef x
	BCH_CSUM_OPT_NR
};

#define BCH_COMPRESSION_TYPES()		\
	x(none,			0)	\
	x(lz4_old,		1)	\
	x(gzip,			2)	\
	x(lz4,			3)	\
	x(zstd,			4)	\
	x(incompressible,	5)

enum bch_compression_type {
#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
	BCH_COMPRESSION_TYPES()
#undef x
	BCH_COMPRESSION_TYPE_NR
};

#define BCH_COMPRESSION_OPTS()		\
	x(none,		0)		\
	x(lz4,		1)		\
	x(gzip,		2)		\
	x(zstd,		3)

enum bch_compression_opts {
#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
	BCH_COMPRESSION_OPTS()
#undef x
	BCH_COMPRESSION_OPT_NR
};
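
/*
 * Illustrative sketch (hypothetical table, not in this file): the
 * user-visible opts and the on-disk types are distinct namespaces - e.g. the
 * lz4 opt is 1 but the (non-legacy) lz4 type is 3 - so consumers need an
 * explicit mapping along these lines:
 */
#if 0
static const enum bch_compression_type example_opt_to_type[] = {
	[BCH_COMPRESSION_OPT_none]	= BCH_COMPRESSION_TYPE_none,
	[BCH_COMPRESSION_OPT_lz4]	= BCH_COMPRESSION_TYPE_lz4,
	[BCH_COMPRESSION_OPT_gzip]	= BCH_COMPRESSION_TYPE_gzip,
	[BCH_COMPRESSION_OPT_zstd]	= BCH_COMPRESSION_TYPE_zstd,
};
#endif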

/*
 * Magic numbers
 *
 * The various other data structures have their own magic numbers, which are
 * xored with the first part of the filesystem's UUID
 */

#define BCACHE_MAGIC							\
	UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca,				\
		  0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
#define BCHFS_MAGIC							\
	UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,				\
		  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)

#define BCACHEFS_STATFS_MAGIC		0xca451a4e

#define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
#define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)

static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
{
	__le64 ret;

	memcpy(&ret, &sb->uuid, sizeof(ret));
	return ret;
}

static inline __u64 __jset_magic(struct bch_sb *sb)
{
	return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC);
}

static inline __u64 __bset_magic(struct bch_sb *sb)
{
	return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC);
}
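
/*
 * Illustrative sketch (hypothetical check, not in this file): because each
 * magic is derived from the filesystem UUID, validating a journal entry's
 * magic (struct jset, defined below) simultaneously checks that it belongs
 * to this filesystem:
 */
#if 0
static bool example_jset_belongs_to_fs(struct jset *j, struct bch_sb *sb)
{
	return __le64_to_cpu(j->magic) == __jset_magic(sb);
}
#endif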

/* Journal */

#define JSET_KEYS_U64s	(sizeof(struct jset_entry) / sizeof(__u64))

#define BCH_JSET_ENTRY_TYPES()			\
	x(btree_keys,		0)		\
	x(btree_root,		1)		\
	x(prio_ptrs,		2)		\
	x(blacklist,		3)		\
	x(blacklist_v2,		4)		\
	x(usage,		5)		\
	x(data_usage,		6)		\
	x(clock,		7)		\
	x(dev_usage,		8)		\
	x(log,			9)		\
	x(overwrite,		10)		\
	x(write_buffer_keys,	11)

enum {
#define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
	BCH_JSET_ENTRY_TYPES()
#undef x
	BCH_JSET_ENTRY_NR
};

static inline bool jset_entry_is_key(struct jset_entry *e)
{
	switch (e->type) {
	case BCH_JSET_ENTRY_btree_keys:
	case BCH_JSET_ENTRY_btree_root:
	case BCH_JSET_ENTRY_overwrite:
	case BCH_JSET_ENTRY_write_buffer_keys:
		return true;
	}

	return false;
}

/*
 * Journal sequence numbers can be blacklisted: bsets record the max sequence
 * number of all the journal entries they contain updates for, so that on
 * recovery we can ignore those bsets that contain index updates newer than
 * what made it into the journal.
 *
 * This means that we can't reuse that journal_seq - we have to skip it, and
 * then record that we skipped it so that the next time we crash and recover we
 * don't think there was a missing journal entry.
 */
struct jset_entry_blacklist {
	struct jset_entry	entry;
	__le64			seq;
};

struct jset_entry_blacklist_v2 {
	struct jset_entry	entry;
	__le64			start;
	__le64			end;
};
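
/*
 * Illustrative sketch (hypothetical helper, not in this file): given the
 * blacklist_v2 entry above, recovery would drop a bset whose journal_seq
 * falls within a blacklisted [start, end] range:
 */
#if 0
static bool example_seq_is_blacklisted(__u64 seq, struct jset_entry_blacklist_v2 *bl)
{
	return seq >= __le64_to_cpu(bl->start) && seq <= __le64_to_cpu(bl->end);
}
#endif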

#define BCH_FS_USAGE_TYPES()			\
	x(reserved,		0)		\
	x(inodes,		1)		\
	x(key_version,		2)

enum {
#define x(f, nr)	BCH_FS_USAGE_##f	= nr,
	BCH_FS_USAGE_TYPES()
#undef x
	BCH_FS_USAGE_NR
};

struct jset_entry_usage {
	struct jset_entry	entry;
	__le64			v;
} __packed;

struct jset_entry_data_usage {
	struct jset_entry	entry;
	__le64			v;
	struct bch_replicas_entry_v1 r;
} __packed;

struct jset_entry_clock {
	struct jset_entry	entry;
	__u8			rw;
	__u8			pad[7];
	__le64			time;
} __packed;

struct jset_entry_dev_usage_type {
	__le64			buckets;
	__le64			sectors;
	__le64			fragmented;
} __packed;

struct jset_entry_dev_usage {
	struct jset_entry	entry;
	__le32			dev;
	__u32			pad;

	__le64			_buckets_ec;		/* No longer used */
	__le64			_buckets_unavailable;	/* No longer used */

	struct jset_entry_dev_usage_type d[];
};

static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
{
	return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
		sizeof(struct jset_entry_dev_usage_type);
}
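
/*
 * Illustrative sketch (hypothetical walk, not in this file): the number of
 * per-data-type triples in d[] is implied by the entry's total size, so a
 * reader iterates like this:
 */
#if 0
static __u64 example_total_buckets(struct jset_entry_dev_usage *u)
{
	__u64 ret = 0;

	for (unsigned i = 0; i < jset_entry_dev_usage_nr_types(u); i++)
		ret += __le64_to_cpu(u->d[i].buckets);
	return ret;
}
#endif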

struct jset_entry_log {
	struct jset_entry	entry;
	u8			d[];
} __packed __aligned(8);

/*
 * On disk format for a journal entry:
 * seq is monotonically increasing; every journal entry has its own unique
 * sequence number.
 *
 * last_seq is the oldest journal entry that still has keys the btree hasn't
 * flushed to disk yet.
 *
 * version is for on disk format changes.
 */
struct jset {
	struct bch_csum		csum;

	__le64			magic;
	__le64			seq;
	__le32			version;
	__le32			flags;

	__le32			u64s; /* size of d[] in u64s */

	__u8			encrypted_start[0];

	__le16			_read_clock; /* no longer used */
	__le16			_write_clock;

	/* Sequence number of oldest dirty journal entry */
	__le64			last_seq;

	struct jset_entry	start[0];
	__u64			_data[];
} __packed __aligned(8);

LE32_BITMASK(JSET_CSUM_TYPE,	struct jset, flags, 0, 4);
LE32_BITMASK(JSET_BIG_ENDIAN,	struct jset, flags, 4, 5);
LE32_BITMASK(JSET_NO_FLUSH,	struct jset, flags, 5, 6);
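
/*
 * Illustrative sketch (assumption about layout, matching the u64s comment
 * above): a jset's on-disk footprint is the fixed header plus u64s 64-bit
 * words of entries, which is what replay has to checksum and walk:
 */
#if 0
static size_t example_jset_bytes(struct jset *j)
{
	return sizeof(struct jset) + __le32_to_cpu(j->u64s) * sizeof(__u64);
}
#endif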

#define BCH_JOURNAL_BUCKETS_MIN		8

/* Btree: */

enum btree_id_flags {
	BTREE_ID_EXTENTS	= BIT(0),
	BTREE_ID_SNAPSHOTS	= BIT(1),
	BTREE_ID_SNAPSHOT_FIELD	= BIT(2),
	BTREE_ID_DATA		= BIT(3),
};

#define BCH_BTREE_IDS()								\
	x(extents,		0,	BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_error)|						\
	  BIT_ULL(KEY_TYPE_cookie)|						\
	  BIT_ULL(KEY_TYPE_extent)|						\
	  BIT_ULL(KEY_TYPE_reservation)|					\
	  BIT_ULL(KEY_TYPE_reflink_p)|						\
	  BIT_ULL(KEY_TYPE_inline_data))					\
	x(inodes,		1,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_inode)|						\
	  BIT_ULL(KEY_TYPE_inode_v2)|						\
	  BIT_ULL(KEY_TYPE_inode_v3)|						\
	  BIT_ULL(KEY_TYPE_inode_generation))					\
	x(dirents,		2,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
	  BIT_ULL(KEY_TYPE_dirent))						\
	x(xattrs,		3,	BTREE_ID_SNAPSHOTS,			\
	  BIT_ULL(KEY_TYPE_whiteout)|						\
	  BIT_ULL(KEY_TYPE_cookie)|						\
	  BIT_ULL(KEY_TYPE_hash_whiteout)|					\
	  BIT_ULL(KEY_TYPE_xattr))						\
	x(alloc,		4,	0,					\
	  BIT_ULL(KEY_TYPE_alloc)|						\
	  BIT_ULL(KEY_TYPE_alloc_v2)|						\
	  BIT_ULL(KEY_TYPE_alloc_v3)|						\
	  BIT_ULL(KEY_TYPE_alloc_v4))						\
	x(quotas,		5,	0,					\
	  BIT_ULL(KEY_TYPE_quota))						\
	x(stripes,		6,	0,					\
	  BIT_ULL(KEY_TYPE_stripe))						\
	x(reflink,		7,	BTREE_ID_EXTENTS|BTREE_ID_DATA,		\
	  BIT_ULL(KEY_TYPE_reflink_v)|						\
	  BIT_ULL(KEY_TYPE_indirect_inline_data))				\
	x(subvolumes,		8,	0,					\
	  BIT_ULL(KEY_TYPE_subvolume))						\
	x(snapshots,		9,	0,					\
	  BIT_ULL(KEY_TYPE_snapshot))						\
	x(lru,			10,	0,					\
	  BIT_ULL(KEY_TYPE_set))						\
	x(freespace,		11,	BTREE_ID_EXTENTS,			\
	  BIT_ULL(KEY_TYPE_set))						\
	x(need_discard,		12,	0,					\
	  BIT_ULL(KEY_TYPE_set))						\
	x(backpointers,		13,	0,					\
	  BIT_ULL(KEY_TYPE_backpointer))					\
	x(bucket_gens,		14,	0,					\
	  BIT_ULL(KEY_TYPE_bucket_gens))					\
	x(snapshot_trees,	15,	0,					\
	  BIT_ULL(KEY_TYPE_snapshot_tree))					\
	x(deleted_inodes,	16,	BTREE_ID_SNAPSHOT_FIELD,		\
	  BIT_ULL(KEY_TYPE_set))						\
	x(logged_ops,		17,	0,					\
	  BIT_ULL(KEY_TYPE_logged_op_truncate)|					\
	  BIT_ULL(KEY_TYPE_logged_op_finsert))					\
	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,		\
	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))

enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,
	BCH_BTREE_IDS()
#undef x
	BTREE_ID_NR
};
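
/*
 * Illustrative sketch (hypothetical table, not in this file): the extra
 * per-btree arguments above (flags, permitted key types) exist so other
 * consumers can expand the same table with their own x(), e.g.:
 */
#if 0
static const __u64 example_btree_allowed_key_types[] = {
#define x(name, nr, flags, keys)	[nr] = keys,
	BCH_BTREE_IDS()
#undef x
};
#endif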

#define BTREE_MAX_DEPTH		4U

/* Btree nodes */

/*
 * Btree nodes
 *
 * On disk a btree node is a list/log of these; within each set the keys are
 * sorted
 */
struct bset {
	__le64			seq;

	/*
	 * Highest journal entry this bset contains keys for.
	 * If on recovery we don't see that journal entry, this bset is ignored:
	 * this allows us to preserve the order of all index updates after a
	 * crash, since the journal records a total order of all index updates
	 * and anything that didn't make it to the journal doesn't get used.
	 */
	__le64			journal_seq;

	__le32			flags;
	__le16			version;
	__le16			u64s; /* count of d[] in u64s */

	struct bkey_packed	start[0];
	__u64			_data[];
} __packed __aligned(8);

LE32_BITMASK(BSET_CSUM_TYPE,	struct bset, flags, 0, 4);

LE32_BITMASK(BSET_BIG_ENDIAN,	struct bset, flags, 4, 5);
LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
				struct bset, flags, 5, 6);

/* Sector offset within the btree node: */
LE32_BITMASK(BSET_OFFSET,	struct bset, flags, 16, 32);

struct btree_node {
	struct bch_csum		csum;
	__le64			magic;

	/* this flags field is encrypted, unlike bset->flags: */
	__le64			flags;

	/* Closed interval: */
	struct bpos		min_key;
	struct bpos		max_key;
	struct bch_extent_ptr	_ptr; /* not used anymore */
	struct bkey_format	format;

	union {
	struct bset		keys;
	struct {
		__u8		pad[22];
		__le16		u64s;
		__u64		_data[0];
	};
	};
} __packed __aligned(8);

LE64_BITMASK(BTREE_NODE_ID_LO,	struct btree_node, flags,  0,  4);
LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
				struct btree_node, flags,  8,  9);
LE64_BITMASK(BTREE_NODE_ID_HI,	struct btree_node, flags,  9, 25);
/* 25-32 unused */
LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);

static inline __u64 BTREE_NODE_ID(struct btree_node *n)
{
	return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4);
}

static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v)
{
	SET_BTREE_NODE_ID_LO(n, v);
	SET_BTREE_NODE_ID_HI(n, v >> 4);
}

struct btree_node_entry {
	struct bch_csum		csum;

	union {
	struct bset		keys;
	struct {
		__u8		pad[22];
		__le16		u64s;
		__u64		_data[0];
	};
	};
} __packed __aligned(8);

#endif /* _BCACHEFS_FORMAT_H */