Commit | Line | Data |
---|---|---|
cafe5635 KO |
1 | #ifndef _BCACHE_JOURNAL_H |
2 | #define _BCACHE_JOURNAL_H | |
3 | ||
4 | /* | |
5 | * THE JOURNAL: | |
6 | * | |
7 | * The journal is treated as a circular buffer of buckets - a journal entry | |
8 | * never spans two buckets. This means (not implemented yet) we can resize the | |
9 | * journal at runtime, and this will be needed for bcache on raw flash support. | |
10 | * | |
11 | * Journal entries contain a list of keys, ordered by the time they were | |
12 | * inserted; thus journal replay just has to reinsert the keys. | |
13 | * | |
14 | * We also keep some things in the journal header that are logically part of the | |
15 | * superblock - all the things that are frequently updated. This is for future | |
16 | * bcache on raw flash support; the superblock (which will become another | |
17 | * journal) can't be moved or wear leveled, so it contains just enough | |
18 | * information to find the main journal, and the superblock only has to be | |
19 | * rewritten when we want to move/wear level the main journal. | |
20 | * | |
21 | * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be | |
22 | * fixed eventually. This isn't a bug - BTREE_REPLACE is used for insertions | |
23 | * from cache misses, which don't have to be journaled, and for writeback and | |
24 | * moving gc we work around it by flushing the btree to disk before updating the | |
25 | * gc information. But it is a potential issue with incremental garbage | |
26 | * collection, and it's fragile. | |
27 | * | |
28 | * OPEN JOURNAL ENTRIES: | |
29 | * | |
30 | * Each journal entry contains, in the header, the sequence number of the last | |
31 | * journal entry still open - i.e. that has keys that haven't been flushed to | |
32 | * disk in the btree. | |
33 | * | |
34 | * We track this by maintaining a refcount for every open journal entry, in a | |
35 | * fifo; each entry in the fifo corresponds to a particular journal | |
36 | * entry/sequence number. When the refcount at the tail of the fifo goes to | |
37 | * zero, we pop it off - thus, the size of the fifo tells us the number of open | |
38 | * journal entries. | |
39 | * | |
40 | * We take a refcount on a journal entry when we add some keys to a journal | |
41 | * entry that we're going to insert (held by struct btree_op), and then when we | |
42 | * insert those keys into the btree the btree write we're setting up takes a | |
43 | * copy of that refcount (held by struct btree_write). That refcount is dropped | |
44 | * when the btree write completes. | |
45 | * | |
46 | * A struct btree_write can only hold a refcount on a single journal entry, but | |
47 | * might contain keys for many journal entries - we handle this by making sure | |
48 | * it always has a refcount on the _oldest_ journal entry of all the journal | |
49 | * entries it has keys for. | |
50 | * | |
51 | * JOURNAL RECLAIM: | |
52 | * | |
53 | * As mentioned previously, our fifo of refcounts tells us the number of open | |
54 | * journal entries; from that and the current journal sequence number we compute | |
55 | * last_seq - the oldest journal entry we still need. We write last_seq in each | |
56 | * journal entry, and we also have to keep track of where it exists on disk so | |
57 | * we don't overwrite it when we loop around the journal. | |
58 | * | |
59 | * To do that we track, for each journal bucket, the sequence number of the | |
60 | * newest journal entry it contains - if we don't need that journal entry we | |
61 | * don't need anything in that bucket anymore. From that we track the last | |
62 | * journal bucket we still need; all this is tracked in struct journal_device | |
63 | * and updated by journal_reclaim(). | |
64 | * | |
65 | * JOURNAL FILLING UP: | |
66 | * | |
67 | * There are two ways the journal could fill up; either we could run out of | |
68 | * space to write to, or we could have too many open journal entries and run out | |
69 | * of room in the fifo of refcounts. Since those refcounts are decremented | |
70 | * without any locking we can't safely resize that fifo, so we handle it the | |
71 | * same way. | |
72 | * | |
73 | * If the journal fills up, we start flushing dirty btree nodes until we can | |
74 | * allocate space for a journal write again - preferentially flushing btree | |
75 | * nodes that are pinning the oldest journal entries first. | |
76 | */ | |
77 | ||
cafe5635 KO |
78 | /* |
79 | * Only used for holding the journal entries we read in bch_journal_read() | |
80 | * during cache registration. | |
81 | */ | |
82 | struct journal_replay { | |
83 | struct list_head list; /* links this entry into a list of replay entries */ | |
84 | atomic_t *pin; /* presumably a refcount slot in journal.pin - confirm */ | |
85 | struct jset j; /* the journal entry that was read */ | |
86 | }; | |
87 | ||
88 | /* | |
89 | * We put two of these in struct journal; we use them for writes to the | |
90 | * journal that are being staged or in flight. | |
91 | */ | |
92 | struct journal_write { | |
93 | struct jset *data; /* presumably the journal entry being staged/written - confirm */ | |
94 | #define JSET_BITS 3 | |
95 | ||
96 | struct cache_set *c; | |
97 | struct closure_waitlist wait; | |
98 | bool need_write; /* presumably set while this write still has to be issued - confirm */ | |
99 | }; | |
100 | ||
101 | /* Embedded in struct cache_set */ | |
102 | struct journal { | |
103 | spinlock_t lock; | |
104 | /* used when waiting because the journal was full */ | |
105 | struct closure_waitlist wait; | |
7857d5d4 | 106 | struct closure io; |
cb7a583e | 107 | int io_in_flight; |
7857d5d4 | 108 | struct delayed_work work; |
cafe5635 KO |
109 | |
110 | /* Number of blocks free in the bucket(s) we're currently writing to */ | |
111 | unsigned blocks_free; | |
112 | uint64_t seq; /* presumably the current journal sequence number - confirm */ | |
113 | DECLARE_FIFO(atomic_t, pin); /* fifo of refcounts, one per open journal entry */ | |
114 | ||
115 | BKEY_PADDED(key); | |
116 | ||
117 | struct journal_write w[2], *cur; /* the two journal writes; cur presumably points at one of w[] */ | |
118 | }; | |
119 | ||
120 | /* | |
121 | * Embedded in struct cache. First three fields refer to the array of journal | |
122 | * buckets, in cache_sb. | |
123 | */ | |
124 | struct journal_device { | |
125 | /* | |
126 | * For each journal bucket, contains the max sequence number of the | |
127 | * journal writes it contains - so we know when a bucket can be reused. | |
128 | */ | |
129 | uint64_t seq[SB_JOURNAL_BUCKETS]; | |
130 | ||
131 | /* Journal bucket we're currently writing to */ | |
132 | unsigned cur_idx; | |
133 | ||
134 | /* Last journal bucket that still contains an open journal entry */ | |
135 | unsigned last_idx; | |
136 | ||
137 | /* Next journal bucket to be discarded */ | |
138 | unsigned discard_idx; | |
139 | ||
140 | #define DISCARD_READY 0 | |
141 | #define DISCARD_IN_FLIGHT 1 | |
142 | #define DISCARD_DONE 2 | |
143 | /* Discard state; expected to hold one of the DISCARD_* values above */ | |
144 | atomic_t discard_in_flight; | |
145 | ||
146 | struct work_struct discard_work; | |
147 | struct bio discard_bio; | |
148 | struct bio_vec discard_bv; | |
149 | ||
150 | /* Bio for journal reads/writes to this device */ | |
151 | struct bio bio; | |
152 | struct bio_vec bv[8]; | |
153 | }; | |
154 | ||
155 | #define journal_pin_cmp(c, l, r) \ | |
c18536a7 | 156 | (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) /* true when fifo_idx() of l in c->journal.pin exceeds that of r; evaluates c twice */ |
cafe5635 KO |
157 | |
158 | #define JOURNAL_PIN 20000 /* presumably the capacity of the journal.pin fifo - confirm */ | |
159 | ||
160 | #define journal_full(j) \ | |
161 | (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) /* true when no blocks are free or fifo_free() of the pin fifo is <= 1; evaluates j twice */ | |
162 | ||
163 | struct closure; /* forward declarations for the pointer parameters used below */ | |
164 | struct cache_set; | |
165 | struct btree_op; /* NOTE(review): not used by any prototype in this header - confirm still needed */ | |
a34a8bfd | 166 | struct keylist; |
cafe5635 | 167 | |
a34a8bfd | 168 | atomic_t *bch_journal(struct cache_set *, struct keylist *, struct closure *); /* presumably returns the pin refcount of the entry the keys were added to - confirm */ |
cafe5635 KO |
169 | void bch_journal_next(struct journal *); |
170 | void bch_journal_mark(struct cache_set *, struct list_head *); | |
171 | void bch_journal_meta(struct cache_set *, struct closure *); | |
c18536a7 KO |
172 | int bch_journal_read(struct cache_set *, struct list_head *); |
173 | int bch_journal_replay(struct cache_set *, struct list_head *); | |
cafe5635 KO |
174 | |
175 | void bch_journal_free(struct cache_set *); | |
176 | int bch_journal_alloc(struct cache_set *); | |
177 | ||
178 | #endif /* _BCACHE_JOURNAL_H */ |