Commit | Line | Data |
---|---|---|
cafe5635 KO |
1 | #ifndef _BCACHE_JOURNAL_H |
2 | #define _BCACHE_JOURNAL_H | |
3 | ||
4 | /* | |
5 | * THE JOURNAL: | |
6 | * | |
7 | * The journal is treated as a circular buffer of buckets - a journal entry | |
8 | * never spans two buckets. This means (not implemented yet) we can resize the | |
9 | * journal at runtime, and will be needed for bcache on raw flash support. | |
10 | * | |
11 | * Journal entries contain a list of keys, ordered by the time they were | |
12 | * inserted; thus journal replay just has to reinsert the keys. | |
13 | * | |
14 | * We also keep some things in the journal header that are logically part of the | |
15 | * superblock - all the things that are frequently updated. This is for future | |
16 | * bcache on raw flash support; the superblock (which will become another | |
17 | * journal) can't be moved or wear leveled, so it contains just enough | |
18 | * information to find the main journal, and the superblock only has to be | |
19 | * rewritten when we want to move/wear level the main journal. | |
20 | * | |
21 | * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be | |
22 | * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions | |
23 | * from cache misses, which don't have to be journaled, and for writeback and | |
24 | * moving gc we work around it by flushing the btree to disk before updating the | |
25 | * gc information. But it is a potential issue with incremental garbage | |
26 | * collection, and it's fragile. | |
27 | * | |
28 | * OPEN JOURNAL ENTRIES: | |
29 | * | |
30 | * Each journal entry contains, in the header, the sequence number of the last | |
31 | * journal entry still open - i.e. that has keys that haven't been flushed to | |
32 | * disk in the btree. | |
33 | * | |
34 | * We track this by maintaining a refcount for every open journal entry, in a | |
35 | * fifo; each entry in the fifo corresponds to a particular journal | |
36 | * entry/sequence number. When the refcount at the tail of the fifo goes to | |
37 | * zero, we pop it off - thus, the size of the fifo tells us the number of open | |
38 | * journal entries. | |
39 | * | |
40 | * We take a refcount on a journal entry when we add some keys to a journal | |
41 | * entry that we're going to insert (held by struct btree_op), and then when we | |
42 | * insert those keys into the btree the btree write we're setting up takes a | |
43 | * copy of that refcount (held by struct btree_write). That refcount is dropped | |
44 | * when the btree write completes. | |
45 | * | |
46 | * A struct btree_write can only hold a refcount on a single journal entry, but | |
47 | * might contain keys for many journal entries - we handle this by making sure | |
48 | * it always has a refcount on the _oldest_ journal entry of all the journal | |
49 | * entries it has keys for. | |
50 | * | |
51 | * JOURNAL RECLAIM: | |
52 | * | |
53 | * As mentioned previously, our fifo of refcounts tells us the number of open | |
54 | * journal entries; from that and the current journal sequence number we compute | |
55 | * last_seq - the oldest journal entry we still need. We write last_seq in each | |
56 | * journal entry, and we also have to keep track of where it exists on disk so | |
57 | * we don't overwrite it when we loop around the journal. | |
58 | * | |
59 | * To do that we track, for each journal bucket, the sequence number of the | |
60 | * newest journal entry it contains - if we don't need that journal entry we | |
61 | * don't need anything in that bucket anymore. From that we track the last | |
62 | * journal bucket we still need; all this is tracked in struct journal_device | |
63 | * and updated by journal_reclaim(). | |
64 | * | |
65 | * JOURNAL FILLING UP: | |
66 | * | |
67 | * There are two ways the journal could fill up; either we could run out of | |
68 | * space to write to, or we could have too many open journal entries and run out | |
69 | * of room in the fifo of refcounts. Since those refcounts are decremented | |
70 | * without any locking we can't safely resize that fifo, so we handle it the | |
71 | * same way. | |
72 | * | |
73 | * If the journal fills up, we start flushing dirty btree nodes until we can | |
74 | * allocate space for a journal write again - preferentially flushing btree | |
75 | * nodes that are pinning the oldest journal entries first. | |
76 | */ | |
77 | ||
cafe5635 KO |
78 | /* |
79 | * Only used for holding the journal entries we read in bch_journal_read() | |
80 | * during cache_registration | |
81 | */ | |
82 | struct journal_replay { | |
83 | struct list_head list; /* list linkage; presumably the list filled by bch_journal_read() - confirm */ | |
84 | atomic_t *pin; /* presumably this entry's refcount in the journal pin fifo - TODO confirm */ | |
85 | struct jset j; /* the journal entry itself */ | |
86 | }; | |
87 | ||
88 | /* | |
89 | * We put two of these in struct journal; we use them for writes to the | |
90 | * journal that are being staged or in flight. | |
91 | */ | |
92 | struct journal_write { | |
93 | struct jset *data; /* buffer for the journal entry being staged or written */ | |
94 | #define JSET_BITS 3 | |
95 | ||
96 | struct cache_set *c; /* presumably the cache set this write belongs to - confirm */ | |
97 | struct closure_waitlist wait; | |
dabb4433 | 98 | bool dirty; /* NOTE(review): presumably set once the entry holds unwritten keys - confirm */ |
cafe5635 KO |
99 | bool need_write; /* NOTE(review): presumably set when a write of this entry has been requested - confirm */ |
100 | }; | |
101 | ||
102 | /* Embedded in struct cache_set */ | |
103 | struct journal { | |
104 | spinlock_t lock; | |
105 | /* Used when waiting because the journal was full */ | |
106 | struct closure_waitlist wait; | |
7857d5d4 | 107 | struct closure io; |
cb7a583e | 108 | int io_in_flight; |
7857d5d4 | 109 | struct delayed_work work; |
cafe5635 KO |
110 | |
111 | /* Number of blocks free in the bucket(s) we're currently writing to */ | |
112 | unsigned blocks_free; | |
113 | uint64_t seq; /* presumably the current journal sequence number (see JOURNAL RECLAIM) - confirm */ | |
114 | DECLARE_FIFO(atomic_t, pin); /* fifo of refcounts, one per open journal entry (see OPEN JOURNAL ENTRIES) */ | |
115 | ||
116 | BKEY_PADDED(key); | |
117 | ||
118 | struct journal_write w[2], *cur; /* the two journal_write slots; cur presumably points at the one being filled - confirm */ | |
119 | }; | |
120 | ||
121 | /* | |
122 | * Embedded in struct cache. seq[] and the *_idx fields below all refer to the | |
123 | * array of journal buckets, in cache_sb. | |
124 | */ | |
125 | struct journal_device { | |
126 | /* | |
127 | * For each journal bucket, contains the max sequence number of the | |
128 | * journal writes it contains - so we know when a bucket can be reused. | |
129 | */ | |
130 | uint64_t seq[SB_JOURNAL_BUCKETS]; | |
131 | ||
132 | /* Journal bucket we're currently writing to */ | |
133 | unsigned cur_idx; | |
134 | ||
135 | /* Last journal bucket that still contains an open journal entry */ | |
136 | unsigned last_idx; | |
137 | ||
138 | /* Next journal bucket to be discarded */ | |
139 | unsigned discard_idx; | |
140 | ||
141 | #define DISCARD_READY 0 | |
142 | #define DISCARD_IN_FLIGHT 1 | |
143 | #define DISCARD_DONE 2 | |
144 | /* Discard state; presumably holds one of the DISCARD_* values above - confirm against users */ | |
145 | atomic_t discard_in_flight; | |
146 | ||
147 | struct work_struct discard_work; | |
148 | struct bio discard_bio; | |
149 | struct bio_vec discard_bv; | |
150 | ||
151 | /* Bio for journal reads/writes to this device */ | |
152 | struct bio bio; | |
153 | struct bio_vec bv[8]; | |
154 | }; | |
155 | /* journal_pin_cmp(c, l, r): true when pin l has a greater fifo_idx() in (c)->journal.pin than pin r */ ||
156 | #define journal_pin_cmp(c, l, r) \ | |
c18536a7 | 157 | (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) |
cafe5635 KO |
158 | /* NOTE(review): presumably the number of entries in the journal pin fifo - confirm where it is used */ |
159 | #define JOURNAL_PIN 20000 | |
160 | /* journal_full(j): true when no blocks are free, or fifo_free() reports at most one free slot in the pin fifo */ ||
161 | #define journal_full(j) \ | |
162 | (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) | |
163 | /* Forward declarations. NOTE(review): struct btree_op is not used by any prototype in this header */ ||
164 | struct closure; | |
165 | struct cache_set; | |
166 | struct btree_op; | |
a34a8bfd | 167 | struct keylist; |
cafe5635 | 168 | /* Journal entry points; bch_journal() presumably returns the pin (refcount) of the entry the keys went into - confirm */ |
a34a8bfd | 169 | atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); |
cafe5635 KO |
170 | void bch_journal_next(struct journal *); |
171 | void bch_journal_mark(struct cache_set *, struct list_head *); | |
172 | void bch_journal_meta(struct cache_set *, struct closure *); | |
c18536a7 KO |
173 | int bch_journal_read(struct cache_set *, struct list_head *); |
174 | int bch_journal_replay(struct cache_set *, struct list_head *); | |
cafe5635 KO |
175 | /* Journal setup/teardown for a cache set; the int return is presumably 0 or a negative errno - confirm */ |
176 | void bch_journal_free(struct cache_set *); | |
177 | int bch_journal_alloc(struct cache_set *); | |
178 | ||
179 | #endif /* _BCACHE_JOURNAL_H */ |