Commit | Line | Data |
---|---|---|
09eff388 MS |
1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* | |
3 | * Copyright 2023 Red Hat | |
4 | */ | |
5 | ||
6 | #include "slab-depot.h" | |
7 | ||
8 | #include <linux/atomic.h> | |
9 | #include <linux/bio.h> | |
4c79d556 | 10 | #include <linux/err.h> |
09eff388 MS |
11 | #include <linux/log2.h> |
12 | #include <linux/min_heap.h> | |
13 | #include <linux/minmax.h> | |
14 | ||
15 | #include "logger.h" | |
16 | #include "memory-alloc.h" | |
17 | #include "numeric.h" | |
18 | #include "permassert.h" | |
19 | #include "string-utils.h" | |
20 | ||
21 | #include "action-manager.h" | |
22 | #include "admin-state.h" | |
23 | #include "completion.h" | |
24 | #include "constants.h" | |
25 | #include "data-vio.h" | |
26 | #include "encodings.h" | |
27 | #include "io-submitter.h" | |
28 | #include "physical-zone.h" | |
29 | #include "priority-table.h" | |
30 | #include "recovery-journal.h" | |
31 | #include "repair.h" | |
32 | #include "status-codes.h" | |
33 | #include "types.h" | |
34 | #include "vdo.h" | |
35 | #include "vio.h" | |
36 | #include "wait-queue.h" | |
37 | ||
38 | static const u64 BYTES_PER_WORD = sizeof(u64); | |
39 | static const bool NORMAL_OPERATION = true; | |
40 | ||
41 | /** | |
42 | * get_lock() - Get the lock object for a slab journal block by sequence number. | |
43 | * @journal: vdo_slab journal to retrieve from. | |
44 | * @sequence_number: Sequence number of the block. | |
45 | * | |
46 | * Return: The lock object for the given sequence number. | |
47 | */ | |
48 | static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal, | |
49 | sequence_number_t sequence_number) | |
50 | { | |
51 | return &journal->locks[sequence_number % journal->size]; | |
52 | } | |
53 | ||
54 | static bool is_slab_open(struct vdo_slab *slab) | |
55 | { | |
56 | return (!vdo_is_state_quiescing(&slab->state) && | |
57 | !vdo_is_state_quiescent(&slab->state)); | |
58 | } | |
59 | ||
60 | /** | |
61 | * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush. | |
62 | * @journal: The journal to check. | |
63 | * | |
64 | * Return: true if there are entry waiters and the slab is not currently rebuilding. |
65 | */ | |
66 | static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal) | |
67 | { | |
68 | return ((journal->slab->status != VDO_SLAB_REBUILDING) && | |
d6e260cc | 69 | vdo_waitq_has_waiters(&journal->entry_waiters)); |
09eff388 MS |
70 | } |
71 | ||
72 | /** | |
73 | * is_reaping() - Check whether a reap is currently in progress. | |
74 | * @journal: The journal which may be reaping. | |
75 | * | |
76 | * Return: true if the journal is reaping. | |
77 | */ | |
78 | static inline bool __must_check is_reaping(struct slab_journal *journal) | |
79 | { | |
80 | return (journal->head != journal->unreapable); | |
81 | } | |
82 | ||
83 | /** | |
84 | * initialize_tail_block() - Initialize tail block as a new block. | |
85 | * @journal: The journal whose tail block is being initialized. | |
86 | */ | |
87 | static void initialize_tail_block(struct slab_journal *journal) | |
88 | { | |
89 | struct slab_journal_block_header *header = &journal->tail_header; | |
90 | ||
91 | header->sequence_number = journal->tail; | |
92 | header->entry_count = 0; | |
93 | header->has_block_map_increments = false; | |
94 | } | |
95 | ||
96 | /** | |
97 | * initialize_journal_state() - Set all journal fields appropriately to start journaling. | |
98 | * @journal: The journal to be reset, based on its tail sequence number. | |
99 | */ | |
100 | static void initialize_journal_state(struct slab_journal *journal) | |
101 | { | |
102 | journal->unreapable = journal->head; | |
103 | journal->reap_lock = get_lock(journal, journal->unreapable); | |
104 | journal->next_commit = journal->tail; | |
105 | journal->summarized = journal->last_summarized = journal->tail; | |
106 | initialize_tail_block(journal); | |
107 | } | |
108 | ||
109 | /** | |
110 | * block_is_full() - Check whether a journal block is full. | |
111 | * @journal: The slab journal for the block. | |
112 | * | |
113 | * Return: true if the tail block is full. | |
114 | */ | |
115 | static bool __must_check block_is_full(struct slab_journal *journal) | |
116 | { | |
117 | journal_entry_count_t count = journal->tail_header.entry_count; | |
118 | ||
119 | return (journal->tail_header.has_block_map_increments ? | |
120 | (journal->full_entries_per_block == count) : | |
121 | (journal->entries_per_block == count)); | |
122 | } | |
123 | ||
124 | static void add_entries(struct slab_journal *journal); | |
125 | static void update_tail_block_location(struct slab_journal *journal); | |
d6e260cc | 126 | static void release_journal_locks(struct vdo_waiter *waiter, void *context); |
09eff388 MS |
127 | |
128 | /** | |
129 | * is_slab_journal_blank() - Check whether a slab's journal is blank. | |
130 | * | |
131 | * A slab journal is blank if it has never had any entries recorded in it. | |
132 | * | |
133 | * Return: true if the slab's journal has never been modified. | |
134 | */ | |
135 | static bool is_slab_journal_blank(const struct vdo_slab *slab) | |
136 | { | |
137 | return ((slab->journal.tail == 1) && | |
138 | (slab->journal.tail_header.entry_count == 0)); | |
139 | } | |
140 | ||
141 | /** | |
142 | * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct | |
143 | * order. | |
144 | * @journal: The journal to be marked dirty. | |
145 | * @lock: The recovery journal lock held by the slab journal. | |
146 | */ | |
147 | static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock) | |
148 | { | |
149 | struct slab_journal *dirty_journal; | |
150 | struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals; | |
151 | ||
152 | ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean"); | |
153 | ||
154 | journal->recovery_lock = lock; | |
155 | list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) { | |
156 | if (dirty_journal->recovery_lock <= journal->recovery_lock) | |
157 | break; | |
158 | } | |
159 | ||
160 | list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next); | |
161 | } | |
162 | ||
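As a worked illustration (lock values hypothetical): if the allocator's dirty ring already holds journals with recovery locks 3, 5, and 9, marking a journal dirty with lock 7 scans the ring from the tail, stops at the journal holding lock 5 (the first one whose lock is not greater than 7), and moves the new entry in between 5 and 9, keeping the ring sorted so the journal holding the oldest recovery lock is always at the head.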
163 | static void mark_slab_journal_clean(struct slab_journal *journal) | |
164 | { | |
165 | journal->recovery_lock = 0; | |
166 | list_del_init(&journal->dirty_entry); | |
167 | } | |
168 | ||
169 | static void check_if_slab_drained(struct vdo_slab *slab) | |
170 | { | |
171 | bool read_only; | |
172 | struct slab_journal *journal = &slab->journal; | |
173 | const struct admin_state_code *code; | |
174 | ||
175 | if (!vdo_is_state_draining(&slab->state) || | |
176 | must_make_entries_to_flush(journal) || | |
177 | is_reaping(journal) || | |
178 | journal->waiting_to_commit || | |
179 | !list_empty(&journal->uncommitted_blocks) || | |
180 | journal->updating_slab_summary || | |
181 | (slab->active_count > 0)) | |
182 | return; | |
183 | ||
184 | /* When not suspending or recovering, the slab must be clean. */ | |
185 | code = vdo_get_admin_state_code(&slab->state); | |
186 | read_only = vdo_is_read_only(slab->allocator->depot->vdo); | |
187 | if (!read_only && | |
d6e260cc | 188 | vdo_waitq_has_waiters(&slab->dirty_blocks) && |
09eff388 MS |
189 | (code != VDO_ADMIN_STATE_SUSPENDING) && |
190 | (code != VDO_ADMIN_STATE_RECOVERING)) | |
191 | return; | |
192 | ||
193 | vdo_finish_draining_with_result(&slab->state, | |
194 | (read_only ? VDO_READ_ONLY : VDO_SUCCESS)); | |
195 | } | |
196 | ||
883069e3 MS |
197 | /* FULLNESS HINT COMPUTATION */ |
198 | ||
199 | /** | |
200 | * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be | |
201 | * stored in a slab_summary_entry's 7 bits that are dedicated to its free | |
202 | * count. | |
203 | * @depot: The depot whose summary is being updated. |
204 | * @free_blocks: The number of free blocks. | |
205 | * | |
206 | * Note: the number of free blocks must be strictly less than 2^23 blocks, even though | |
207 | * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least | |
208 | * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might | |
209 | * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f | |
210 | * is 0, which would make it impossible to distinguish completely full from completely empty. | |
211 | * | |
212 | * Return: A fullness hint, which can be stored in 7 bits. | |
213 | */ | |
214 | static u8 __must_check compute_fullness_hint(struct slab_depot *depot, | |
215 | block_count_t free_blocks) | |
216 | { | |
217 | block_count_t hint; | |
218 | ||
219 | ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23"); | |
220 | ||
221 | if (free_blocks == 0) | |
222 | return 0; | |
223 | ||
224 | hint = free_blocks >> depot->hint_shift; | |
225 | return ((hint == 0) ? 1 : hint); | |
226 | } | |
227 | ||
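A minimal userspace sketch of this hint round trip, assuming a hint_shift of 16 and using made-up block counts (both are illustrative assumptions, not values read from the driver); the decode step mirrors the `hint << hint_shift` expansion performed later in update_tail_block_location():

```c
#include <stdio.h>

/* Standalone sketch of the fullness hint encode/decode round trip. */
int main(void)
{
	unsigned int hint_shift = 16;                 /* assumed shift, for illustration */
	unsigned long long free_blocks = 300000;      /* hypothetical free block count */
	unsigned long long hint = free_blocks >> hint_shift;   /* 300000 >> 16 == 4 */

	if ((free_blocks != 0) && (hint == 0))
		hint = 1;   /* a nearly full slab must never look completely full */

	/* The summary later decodes the 7-bit hint into an approximate count. */
	printf("hint=%llu approx_free=%llu\n", hint, hint << hint_shift);   /* 4 and 262144 */
	return 0;
}
```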
228 | /** | |
229 | * check_summary_drain_complete() - Check whether an allocator's summary has finished draining. |
230 | */ | |
231 | static void check_summary_drain_complete(struct block_allocator *allocator) | |
232 | { | |
883069e3 MS |
233 | if (!vdo_is_state_draining(&allocator->summary_state) || |
234 | (allocator->summary_write_count > 0)) | |
235 | return; | |
236 | ||
237 | vdo_finish_operation(&allocator->summary_state, | |
d6e260cc MS |
238 | (vdo_is_read_only(allocator->depot->vdo) ? |
239 | VDO_READ_ONLY : VDO_SUCCESS)); | |
883069e3 MS |
240 | } |
241 | ||
242 | /** | |
243 | * notify_summary_waiters() - Wake all the waiters in a given queue. | |
244 | * @allocator: The block allocator summary which owns the queue. | |
245 | * @queue: The queue to notify. | |
246 | */ | |
247 | static void notify_summary_waiters(struct block_allocator *allocator, | |
d6e260cc | 248 | struct vdo_wait_queue *queue) |
883069e3 | 249 | { |
d6e260cc MS |
250 | int result = (vdo_is_read_only(allocator->depot->vdo) ? |
251 | VDO_READ_ONLY : VDO_SUCCESS); | |
883069e3 | 252 | |
d6e260cc | 253 | vdo_waitq_notify_all_waiters(queue, NULL, &result); |
883069e3 MS |
254 | } |
255 | ||
256 | static void launch_write(struct slab_summary_block *summary_block); | |
257 | ||
258 | /** | |
259 | * finish_updating_slab_summary_block() - Finish processing a block which attempted to write, | |
260 | * whether or not the attempt succeeded. | |
261 | * @block: The block. | |
262 | */ | |
263 | static void finish_updating_slab_summary_block(struct slab_summary_block *block) | |
264 | { | |
265 | notify_summary_waiters(block->allocator, &block->current_update_waiters); | |
266 | block->writing = false; | |
267 | block->allocator->summary_write_count--; | |
d6e260cc | 268 | if (vdo_waitq_has_waiters(&block->next_update_waiters)) |
883069e3 MS |
269 | launch_write(block); |
270 | else | |
271 | check_summary_drain_complete(block->allocator); | |
272 | } | |
273 | ||
274 | /** | |
275 | * finish_update() - This is the callback for a successful summary block write. | |
276 | * @completion: The write vio. | |
277 | */ | |
278 | static void finish_update(struct vdo_completion *completion) | |
279 | { | |
280 | struct slab_summary_block *block = | |
281 | container_of(as_vio(completion), struct slab_summary_block, vio); | |
282 | ||
283 | atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written); | |
284 | finish_updating_slab_summary_block(block); | |
285 | } | |
286 | ||
287 | /** | |
288 | * handle_write_error() - Handle an error writing a slab summary block. | |
289 | * @completion: The write VIO. | |
290 | */ | |
291 | static void handle_write_error(struct vdo_completion *completion) | |
292 | { | |
293 | struct slab_summary_block *block = | |
294 | container_of(as_vio(completion), struct slab_summary_block, vio); | |
295 | ||
296 | vio_record_metadata_io_error(as_vio(completion)); | |
297 | vdo_enter_read_only_mode(completion->vdo, completion->result); | |
298 | finish_updating_slab_summary_block(block); | |
299 | } | |
300 | ||
301 | static void write_slab_summary_endio(struct bio *bio) | |
302 | { | |
303 | struct vio *vio = bio->bi_private; | |
304 | struct slab_summary_block *block = | |
305 | container_of(vio, struct slab_summary_block, vio); | |
306 | ||
307 | continue_vio_after_io(vio, finish_update, block->allocator->thread_id); | |
308 | } | |
309 | ||
310 | /** | |
311 | * launch_write() - Write a slab summary block unless it is currently out for writing. | |
312 | * @block: The block that needs to be committed. | |
313 | */ | |
314 | static void launch_write(struct slab_summary_block *block) | |
315 | { | |
316 | struct block_allocator *allocator = block->allocator; | |
317 | struct slab_depot *depot = allocator->depot; | |
318 | physical_block_number_t pbn; | |
319 | ||
320 | if (block->writing) | |
321 | return; | |
322 | ||
323 | allocator->summary_write_count++; | |
d6e260cc MS |
324 | vdo_waitq_transfer_all_waiters(&block->next_update_waiters, |
325 | &block->current_update_waiters); | |
883069e3 MS |
326 | block->writing = true; |
327 | ||
328 | if (vdo_is_read_only(depot->vdo)) { | |
329 | finish_updating_slab_summary_block(block); | |
330 | return; | |
331 | } | |
332 | ||
333 | memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE); | |
334 | ||
335 | /* | |
336 | * Flush before writing to ensure that the slab journal tail blocks and reference updates | |
dcd1332b SLM |
337 | * covered by this summary update are stable. Otherwise, a subsequent recovery could |
338 | * encounter a slab summary update that refers to a slab journal tail block that has not | |
339 | * actually been written. In such cases, the slab journal referenced will be treated as | |
340 | * empty, causing any data within the slab which predates the existing recovery journal | |
341 | * entries to be lost. | |
883069e3 MS |
342 | */ |
343 | pbn = (depot->summary_origin + | |
344 | (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) + | |
345 | block->index); | |
f7f46761 MS |
346 | vdo_submit_metadata_vio(&block->vio, pbn, write_slab_summary_endio, |
347 | handle_write_error, REQ_OP_WRITE | REQ_PREFLUSH); | |
883069e3 MS |
348 | } |
349 | ||
350 | /** | |
351 | * update_slab_summary_entry() - Update the entry for a slab. | |
352 | * @slab: The slab whose entry is to be updated. |
353 | * @waiter: The waiter that is updating the summary. | |
354 | * @tail_block_offset: The offset of the slab journal's tail block. | |
355 | * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load. | |
356 | * @is_clean: Whether the slab is clean. | |
357 | * @free_blocks: The number of free blocks. | |
358 | */ | |
d6e260cc | 359 | static void update_slab_summary_entry(struct vdo_slab *slab, struct vdo_waiter *waiter, |
883069e3 MS |
360 | tail_block_offset_t tail_block_offset, |
361 | bool load_ref_counts, bool is_clean, | |
362 | block_count_t free_blocks) | |
363 | { | |
364 | u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK; | |
365 | struct block_allocator *allocator = slab->allocator; | |
366 | struct slab_summary_block *block = &allocator->summary_blocks[index]; | |
367 | int result; | |
368 | struct slab_summary_entry *entry; | |
369 | ||
370 | if (vdo_is_read_only(block->vio.completion.vdo)) { | |
371 | result = VDO_READ_ONLY; | |
372 | waiter->callback(waiter, &result); | |
373 | return; | |
374 | } | |
375 | ||
376 | if (vdo_is_state_draining(&allocator->summary_state) || | |
377 | vdo_is_state_quiescent(&allocator->summary_state)) { | |
378 | result = VDO_INVALID_ADMIN_STATE; | |
379 | waiter->callback(waiter, &result); | |
380 | return; | |
381 | } | |
382 | ||
383 | entry = &allocator->summary_entries[slab->slab_number]; | |
384 | *entry = (struct slab_summary_entry) { | |
385 | .tail_block_offset = tail_block_offset, | |
386 | .load_ref_counts = (entry->load_ref_counts || load_ref_counts), | |
387 | .is_dirty = !is_clean, | |
388 | .fullness_hint = compute_fullness_hint(allocator->depot, free_blocks), | |
389 | }; | |
d6e260cc | 390 | vdo_waitq_enqueue_waiter(&block->next_update_waiters, waiter); |
883069e3 MS |
391 | launch_write(block); |
392 | } | |
393 | ||
09eff388 MS |
394 | /** |
395 | * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are | |
396 | * complete. | |
397 | * @journal: The journal to be reaped. | |
398 | */ | |
399 | static void finish_reaping(struct slab_journal *journal) | |
400 | { | |
401 | journal->head = journal->unreapable; | |
402 | add_entries(journal); | |
403 | check_if_slab_drained(journal->slab); | |
404 | } | |
405 | ||
406 | static void reap_slab_journal(struct slab_journal *journal); | |
407 | ||
408 | /** | |
409 | * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try | |
410 | * reaping again in case we deferred reaping due to an outstanding vio. | |
411 | * @completion: The flush vio. | |
412 | */ | |
413 | static void complete_reaping(struct vdo_completion *completion) | |
414 | { | |
415 | struct slab_journal *journal = completion->parent; | |
416 | ||
417 | return_vio_to_pool(journal->slab->allocator->vio_pool, | |
0eea6b6e | 418 | vio_as_pooled_vio(as_vio(vdo_forget(completion)))); |
09eff388 MS |
419 | finish_reaping(journal); |
420 | reap_slab_journal(journal); | |
421 | } | |
422 | ||
423 | /** | |
424 | * handle_flush_error() - Handle an error flushing the lower layer. | |
425 | * @completion: The flush vio. | |
426 | */ | |
427 | static void handle_flush_error(struct vdo_completion *completion) | |
428 | { | |
429 | vio_record_metadata_io_error(as_vio(completion)); | |
430 | vdo_enter_read_only_mode(completion->vdo, completion->result); | |
431 | complete_reaping(completion); | |
432 | } | |
433 | ||
434 | static void flush_endio(struct bio *bio) | |
435 | { | |
436 | struct vio *vio = bio->bi_private; | |
437 | struct slab_journal *journal = vio->completion.parent; | |
438 | ||
439 | continue_vio_after_io(vio, complete_reaping, | |
440 | journal->slab->allocator->thread_id); | |
441 | } | |
442 | ||
443 | /** | |
444 | * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer | |
445 | * prior to reaping. | |
446 | * @waiter: The journal as a flush waiter. | |
447 | * @context: The newly acquired flush vio. | |
448 | */ | |
d6e260cc | 449 | static void flush_for_reaping(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
450 | { |
451 | struct slab_journal *journal = | |
452 | container_of(waiter, struct slab_journal, flush_waiter); | |
453 | struct pooled_vio *pooled = context; | |
454 | struct vio *vio = &pooled->vio; | |
455 | ||
456 | vio->completion.parent = journal; | |
ebe16015 | 457 | vdo_submit_flush_vio(vio, flush_endio, handle_flush_error); |
09eff388 MS |
458 | } |
459 | ||
460 | /** | |
461 | * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks. | |
462 | * @journal: The slab journal. | |
463 | */ | |
464 | static void reap_slab_journal(struct slab_journal *journal) | |
465 | { | |
466 | bool reaped = false; | |
467 | ||
468 | if (is_reaping(journal)) { | |
469 | /* We already have a reap in progress so wait for it to finish. */ | |
470 | return; | |
471 | } | |
472 | ||
473 | if ((journal->slab->status != VDO_SLAB_REBUILT) || | |
474 | !vdo_is_state_normal(&journal->slab->state) || | |
475 | vdo_is_read_only(journal->slab->allocator->depot->vdo)) { | |
476 | /* | |
477 | * We must not reap in the first two cases, and there's no point in read-only mode. | |
478 | */ | |
479 | return; | |
480 | } | |
481 | ||
482 | /* | |
483 | * Start reclaiming blocks only when the journal head has no references. Then stop when a | |
484 | * block is referenced or reap reaches the most recently written block, referenced by the | |
485 | * slab summary, which has the sequence number just before the tail. | |
486 | */ | |
487 | while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) { | |
488 | reaped = true; | |
489 | journal->unreapable++; | |
490 | journal->reap_lock++; | |
491 | if (journal->reap_lock == &journal->locks[journal->size]) | |
492 | journal->reap_lock = &journal->locks[0]; | |
493 | } | |
494 | ||
495 | if (!reaped) | |
496 | return; | |
497 | ||
498 | /* | |
499 | * It is never safe to reap a slab journal block without first issuing a flush, regardless | |
500 | * of whether a user flush has been received or not. In the absence of the flush, the | |
501 | * reference block write which released the locks allowing the slab journal to reap may not | |
502 | * be persisted. Although slab summary writes will eventually issue flushes, multiple slab | |
503 | * journal block writes can be issued while previous slab summary updates have not yet been | |
504 | * made. Even though those slab journal block writes will be ignored if the slab summary | |
505 | * update is not persisted, they may still overwrite the to-be-reaped slab journal block | |
dcd1332b | 506 | * resulting in a loss of reference count updates. |
09eff388 MS |
507 | */ |
508 | journal->flush_waiter.callback = flush_for_reaping; | |
509 | acquire_vio_from_pool(journal->slab->allocator->vio_pool, | |
510 | &journal->flush_waiter); | |
511 | } | |
512 | ||
513 | /** | |
514 | * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block. | |
515 | * @journal: The slab journal. | |
516 | * @sequence_number: The journal sequence number of the referenced block. | |
517 | * @adjustment: Amount to adjust the reference counter. | |
518 | * | |
519 | * Note that when a negative adjustment drops a lock's count to zero, the slab journal will be reaped. |
520 | */ | |
521 | static void adjust_slab_journal_block_reference(struct slab_journal *journal, | |
522 | sequence_number_t sequence_number, | |
523 | int adjustment) | |
524 | { | |
525 | struct journal_lock *lock; | |
526 | ||
527 | if (sequence_number == 0) | |
528 | return; | |
529 | ||
530 | if (journal->slab->status == VDO_SLAB_REPLAYING) { | |
531 | /* Locks should not be used during offline replay. */ | |
532 | return; | |
533 | } | |
534 | ||
535 | ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero"); | |
536 | lock = get_lock(journal, sequence_number); | |
537 | if (adjustment < 0) { | |
538 | ASSERT_LOG_ONLY((-adjustment <= lock->count), | |
539 | "adjustment %d of lock count %u for slab journal block %llu must not underflow", | |
540 | adjustment, lock->count, | |
541 | (unsigned long long) sequence_number); | |
542 | } | |
543 | ||
544 | lock->count += adjustment; | |
545 | if (lock->count == 0) | |
546 | reap_slab_journal(journal); | |
547 | } | |
548 | ||
549 | /** | |
550 | * release_journal_locks() - Callback invoked after a slab summary update completes. | |
551 | * @waiter: The slab summary waiter that has just been notified. | |
552 | * @context: The result code of the update. | |
553 | * | |
554 | * Registered in the constructor on behalf of update_tail_block_location(). | |
555 | * | |
556 | * Implements waiter_callback_fn. | |
557 | */ | |
d6e260cc | 558 | static void release_journal_locks(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
559 | { |
560 | sequence_number_t first, i; | |
561 | struct slab_journal *journal = | |
562 | container_of(waiter, struct slab_journal, slab_summary_waiter); | |
563 | int result = *((int *) context); | |
564 | ||
565 | if (result != VDO_SUCCESS) { | |
566 | if (result != VDO_READ_ONLY) { | |
567 | /* | |
568 | * Don't bother logging what might be lots of errors if we are already in | |
569 | * read-only mode. | |
570 | */ | |
571 | uds_log_error_strerror(result, "failed slab summary update %llu", | |
572 | (unsigned long long) journal->summarized); | |
573 | } | |
574 | ||
575 | journal->updating_slab_summary = false; | |
576 | vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); | |
577 | check_if_slab_drained(journal->slab); | |
578 | return; | |
579 | } | |
580 | ||
581 | if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) { | |
582 | journal->partial_write_in_progress = false; | |
583 | add_entries(journal); | |
584 | } | |
585 | ||
586 | first = journal->last_summarized; | |
587 | journal->last_summarized = journal->summarized; | |
588 | for (i = journal->summarized - 1; i >= first; i--) { | |
589 | /* | |
590 | * Release the lock the summarized block held on the recovery journal. (During | |
591 | * replay, recovery_start will always be 0.) | |
592 | */ | |
593 | if (journal->recovery_journal != NULL) { | |
594 | zone_count_t zone_number = journal->slab->allocator->zone_number; | |
595 | struct journal_lock *lock = get_lock(journal, i); | |
596 | ||
597 | vdo_release_recovery_journal_block_reference(journal->recovery_journal, | |
598 | lock->recovery_start, | |
599 | VDO_ZONE_TYPE_PHYSICAL, | |
600 | zone_number); | |
601 | } | |
602 | ||
603 | /* | |
604 | * Release our own lock against reaping for blocks that are committed. (This | |
605 | * function will not change locks during replay.) | |
606 | */ | |
607 | adjust_slab_journal_block_reference(journal, i, -1); | |
608 | } | |
609 | ||
610 | journal->updating_slab_summary = false; | |
611 | ||
612 | reap_slab_journal(journal); | |
613 | ||
614 | /* Check if the slab summary needs to be updated again. */ | |
615 | update_tail_block_location(journal); | |
616 | } | |
617 | ||
618 | /** | |
619 | * update_tail_block_location() - Update the tail block location in the slab summary, if necessary. | |
620 | * @journal: The slab journal that is updating its tail block location. | |
621 | */ | |
622 | static void update_tail_block_location(struct slab_journal *journal) | |
623 | { | |
624 | block_count_t free_block_count; | |
625 | struct vdo_slab *slab = journal->slab; | |
626 | ||
627 | if (journal->updating_slab_summary || | |
628 | vdo_is_read_only(journal->slab->allocator->depot->vdo) || | |
629 | (journal->last_summarized >= journal->next_commit)) { | |
630 | check_if_slab_drained(slab); | |
631 | return; | |
632 | } | |
633 | ||
634 | if (slab->status != VDO_SLAB_REBUILT) { | |
635 | u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint; | |
636 | ||
637 | free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift; | |
638 | } else { | |
639 | free_block_count = slab->free_blocks; | |
640 | } | |
641 | ||
642 | journal->summarized = journal->next_commit; | |
643 | journal->updating_slab_summary = true; | |
644 | ||
645 | /* | |
646 | * Update slab summary as dirty. | |
647 | * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this | |
648 | * slab have been written to the layer. Therefore, indicate that the ref counts must be | |
649 | * loaded when the journal head has reaped past sequence number 1. | |
650 | */ | |
651 | update_slab_summary_entry(slab, &journal->slab_summary_waiter, | |
652 | journal->summarized % journal->size, | |
653 | (journal->head > 1), false, free_block_count); | |
654 | } | |
655 | ||
656 | /** | |
657 | * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries. | |
658 | */ | |
659 | static void reopen_slab_journal(struct vdo_slab *slab) | |
660 | { | |
661 | struct slab_journal *journal = &slab->journal; | |
662 | sequence_number_t block; | |
663 | ||
664 | ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0, | |
665 | "vdo_slab journal's active block empty before reopening"); | |
666 | journal->head = journal->tail; | |
667 | initialize_journal_state(journal); | |
668 | ||
669 | /* Ensure no locks are spuriously held on an empty journal. */ | |
670 | for (block = 1; block <= journal->size; block++) { | |
671 | ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0), | |
672 | "Scrubbed journal's block %llu is not locked", | |
673 | (unsigned long long) block); | |
674 | } | |
675 | ||
676 | add_entries(journal); | |
677 | } | |
678 | ||
679 | static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio) | |
680 | { | |
681 | const struct packed_slab_journal_block *block = | |
682 | (const struct packed_slab_journal_block *) vio->vio.data; | |
683 | ||
684 | return __le64_to_cpu(block->header.sequence_number); | |
685 | } | |
686 | ||
687 | /** | |
688 | * complete_write() - Handle post-commit processing. | |
689 | * @completion: The write vio as a completion. | |
690 | * | |
691 | * This is the callback registered by write_slab_journal_block(). | |
692 | */ | |
693 | static void complete_write(struct vdo_completion *completion) | |
694 | { | |
695 | int result = completion->result; | |
696 | struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion)); | |
697 | struct slab_journal *journal = completion->parent; | |
698 | sequence_number_t committed = get_committing_sequence_number(pooled); | |
699 | ||
700 | list_del_init(&pooled->list_entry); | |
0eea6b6e | 701 | return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); |
09eff388 MS |
702 | |
703 | if (result != VDO_SUCCESS) { | |
704 | vio_record_metadata_io_error(as_vio(completion)); | |
705 | uds_log_error_strerror(result, "cannot write slab journal block %llu", | |
706 | (unsigned long long) committed); | |
707 | vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); | |
708 | check_if_slab_drained(journal->slab); | |
709 | return; | |
710 | } | |
711 | ||
712 | WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1); | |
713 | ||
714 | if (list_empty(&journal->uncommitted_blocks)) { | |
715 | /* If no blocks are outstanding, then the commit point is at the tail. */ | |
716 | journal->next_commit = journal->tail; | |
717 | } else { | |
718 | /* The commit point is always the beginning of the oldest incomplete block. */ | |
719 | pooled = container_of(journal->uncommitted_blocks.next, | |
720 | struct pooled_vio, list_entry); | |
721 | journal->next_commit = get_committing_sequence_number(pooled); | |
722 | } | |
723 | ||
724 | update_tail_block_location(journal); | |
725 | } | |
726 | ||
727 | static void write_slab_journal_endio(struct bio *bio) | |
728 | { | |
729 | struct vio *vio = bio->bi_private; | |
730 | struct slab_journal *journal = vio->completion.parent; | |
731 | ||
732 | continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id); | |
733 | } | |
734 | ||
735 | /** | |
736 | * write_slab_journal_block() - Write a slab journal block. | |
737 | * @waiter: The vio pool waiter which was just notified. | |
738 | * @context: The vio pool entry for the write. | |
739 | * | |
740 | * Callback from acquire_vio_from_pool() registered in commit_tail(). | |
741 | */ | |
d6e260cc | 742 | static void write_slab_journal_block(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
743 | { |
744 | struct pooled_vio *pooled = context; | |
745 | struct vio *vio = &pooled->vio; | |
746 | struct slab_journal *journal = | |
747 | container_of(waiter, struct slab_journal, resource_waiter); | |
748 | struct slab_journal_block_header *header = &journal->tail_header; | |
749 | int unused_entries = journal->entries_per_block - header->entry_count; | |
750 | physical_block_number_t block_number; | |
751 | const struct admin_state_code *operation; | |
752 | ||
753 | header->head = journal->head; | |
754 | list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks); | |
755 | vdo_pack_slab_journal_block_header(header, &journal->block->header); | |
756 | ||
757 | /* Copy the tail block into the vio. */ | |
758 | memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE); | |
759 | ||
760 | ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull"); | |
761 | if (unused_entries > 0) { | |
762 | /* | |
763 | * Release the per-entry locks for any unused entries in the block we are about to | |
764 | * write. | |
765 | */ | |
766 | adjust_slab_journal_block_reference(journal, header->sequence_number, | |
767 | -unused_entries); | |
768 | journal->partial_write_in_progress = !block_is_full(journal); | |
769 | } | |
770 | ||
771 | block_number = journal->slab->journal_origin + | |
772 | (header->sequence_number % journal->size); | |
773 | vio->completion.parent = journal; | |
774 | ||
775 | /* | |
776 | * This block won't be read in recovery until the slab summary is updated to refer to it. | |
dcd1332b SLM |
777 | * The slab summary update does a flush which is sufficient to protect us from corruption |
778 | * due to out of order slab journal, reference block, or block map writes. | |
09eff388 | 779 | */ |
0eea6b6e | 780 | vdo_submit_metadata_vio(vdo_forget(vio), block_number, write_slab_journal_endio, |
f7f46761 | 781 | complete_write, REQ_OP_WRITE); |
09eff388 MS |
782 | |
783 | /* Since the write is submitted, the tail block structure can be reused. */ | |
784 | journal->tail++; | |
785 | initialize_tail_block(journal); | |
786 | journal->waiting_to_commit = false; | |
787 | ||
788 | operation = vdo_get_admin_state_code(&journal->slab->state); | |
789 | if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) { | |
790 | vdo_finish_operation(&journal->slab->state, | |
791 | (vdo_is_read_only(journal->slab->allocator->depot->vdo) ? | |
792 | VDO_READ_ONLY : VDO_SUCCESS)); | |
793 | return; | |
794 | } | |
795 | ||
796 | add_entries(journal); | |
797 | } | |
798 | ||
799 | /** | |
800 | * commit_tail() - Commit the tail block of the slab journal. | |
801 | * @journal: The journal whose tail block should be committed. | |
802 | */ | |
803 | static void commit_tail(struct slab_journal *journal) | |
804 | { | |
805 | if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) { | |
806 | /* | |
807 | * There are no entries at the moment, but there are some waiters, so defer | |
808 | * initiating the flush until those entries are ready to write. | |
809 | */ | |
810 | return; | |
811 | } | |
812 | ||
813 | if (vdo_is_read_only(journal->slab->allocator->depot->vdo) || | |
814 | journal->waiting_to_commit || | |
815 | (journal->tail_header.entry_count == 0)) { | |
816 | /* | |
817 | * There is nothing to do since the tail block is empty, or writing, or the journal | |
818 | * is in read-only mode. | |
819 | */ | |
820 | return; | |
821 | } | |
822 | ||
823 | /* | |
824 | * Since we are about to commit the tail block, this journal no longer needs to be on the | |
825 | * ring of journals which the recovery journal might ask to commit. | |
826 | */ | |
827 | mark_slab_journal_clean(journal); | |
828 | ||
829 | journal->waiting_to_commit = true; | |
830 | ||
831 | journal->resource_waiter.callback = write_slab_journal_block; | |
832 | acquire_vio_from_pool(journal->slab->allocator->vio_pool, | |
833 | &journal->resource_waiter); | |
834 | } | |
835 | ||
836 | /** | |
837 | * encode_slab_journal_entry() - Encode a slab journal entry. | |
838 | * @tail_header: The unpacked header for the block. | |
839 | * @payload: The journal block payload to hold the entry. | |
840 | * @sbn: The slab block number of the entry to encode. | |
841 | * @operation: The type of the entry. | |
842 | * @increment: True if this is an increment. | |
843 | * | |
844 | * Exposed for unit tests. | |
845 | */ | |
846 | static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header, | |
847 | slab_journal_payload *payload, | |
848 | slab_block_number sbn, | |
849 | enum journal_operation operation, | |
850 | bool increment) | |
851 | { | |
852 | journal_entry_count_t entry_number = tail_header->entry_count++; | |
853 | ||
854 | if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { | |
855 | if (!tail_header->has_block_map_increments) { | |
856 | memset(payload->full_entries.entry_types, 0, | |
857 | VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE); | |
858 | tail_header->has_block_map_increments = true; | |
859 | } | |
860 | ||
861 | payload->full_entries.entry_types[entry_number / 8] |= | |
862 | ((u8)1 << (entry_number % 8)); | |
863 | } | |
864 | ||
865 | vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment); | |
866 | } | |
867 | ||
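A small self-contained sketch of the entry_types bitmap indexing used above, with a hypothetical entry number; byte entry_number / 8 holds the flag and bit entry_number % 8 marks the entry as a block map increment:

```c
#include <stdio.h>

int main(void)
{
	unsigned char entry_types[32] = { 0 };   /* stand-in for payload->full_entries.entry_types */
	unsigned int entry_number = 10;          /* hypothetical entry index */

	/* Same indexing as the block map increment case above. */
	entry_types[entry_number / 8] |= (unsigned char)1 << (entry_number % 8);

	printf("byte %u = 0x%02x\n", entry_number / 8,
	       entry_types[entry_number / 8]);   /* byte 1 = 0x04 */
	return 0;
}
```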
868 | /** | |
869 | * expand_journal_point() - Convert a recovery journal journal_point which refers to both an | |
870 | * increment and a decrement to a single point which refers to one or the | |
871 | * other. | |
872 | * @recovery_point: The journal point to convert. | |
873 | * @increment: Whether the current entry is an increment. | |
874 | * | |
875 | * Return: The expanded journal point | |
876 | * | |
877 | * Each data_vio has only a single recovery journal point, but may need to make both |
878 | * increment and decrement entries in the same slab journal. In order to distinguish the two |
879 | * entries, the entry count of the expanded journal point is twice the actual recovery journal | |
880 | * entry count for increments, and one more than that for decrements. | |
881 | */ | |
882 | static struct journal_point expand_journal_point(struct journal_point recovery_point, | |
883 | bool increment) | |
884 | { | |
885 | recovery_point.entry_count *= 2; | |
886 | if (!increment) | |
887 | recovery_point.entry_count++; | |
888 | ||
889 | return recovery_point; | |
890 | } | |
891 | ||
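Concretely, with hypothetical numbers: a recovery journal point 17.3 expands to slab journal point 17.6 for the increment entry and to 17.7 for the matching decrement entry, so the two slab journal entries made from one recovery journal entry always compare as distinct, consecutive points.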
892 | /** | |
893 | * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a | |
894 | * block becomes full. | |
895 | * @journal: The slab journal to append to. | |
896 | * @pbn: The pbn being adjusted. | |
897 | * @operation: The type of entry to make. | |
898 | * @increment: True if this is an increment. | |
899 | * @recovery_point: The expanded recovery point. | |
900 | * | |
901 | * This function is synchronous. | |
902 | */ | |
903 | static void add_entry(struct slab_journal *journal, physical_block_number_t pbn, | |
904 | enum journal_operation operation, bool increment, | |
905 | struct journal_point recovery_point) | |
906 | { | |
907 | struct packed_slab_journal_block *block = journal->block; | |
908 | int result; | |
909 | ||
910 | result = ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point, | |
911 | &recovery_point), | |
912 | "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u", | |
913 | (unsigned long long) recovery_point.sequence_number, | |
914 | recovery_point.entry_count, | |
915 | (unsigned long long) journal->tail_header.recovery_point.sequence_number, | |
916 | journal->tail_header.recovery_point.entry_count); | |
917 | if (result != VDO_SUCCESS) { | |
918 | vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result); | |
919 | return; | |
920 | } | |
921 | ||
922 | if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) { | |
923 | result = ASSERT((journal->tail_header.entry_count < | |
924 | journal->full_entries_per_block), | |
925 | "block has room for full entries"); | |
926 | if (result != VDO_SUCCESS) { | |
927 | vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, | |
928 | result); | |
929 | return; | |
930 | } | |
931 | } | |
932 | ||
933 | encode_slab_journal_entry(&journal->tail_header, &block->payload, | |
934 | pbn - journal->slab->start, operation, increment); | |
935 | journal->tail_header.recovery_point = recovery_point; | |
936 | if (block_is_full(journal)) | |
937 | commit_tail(journal); | |
938 | } | |
939 | ||
940 | static inline block_count_t journal_length(const struct slab_journal *journal) | |
941 | { | |
942 | return journal->tail - journal->head; | |
943 | } | |
944 | ||
945 | /** | |
946 | * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal. | |
947 | * @slab: The slab to play into. | |
948 | * @pbn: The PBN for the entry. | |
949 | * @operation: The type of entry to add. | |
950 | * @increment: True if this entry is an increment. | |
951 | * @recovery_point: The recovery journal point corresponding to this entry. | |
952 | * @parent: The completion to notify when there is space to add the entry if the entry could not be | |
953 | * added immediately. | |
954 | * | |
955 | * Return: true if the entry was added immediately. | |
956 | */ | |
957 | bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn, | |
958 | enum journal_operation operation, bool increment, | |
959 | struct journal_point *recovery_point, | |
960 | struct vdo_completion *parent) | |
961 | { | |
962 | struct slab_journal *journal = &slab->journal; | |
963 | struct slab_journal_block_header *header = &journal->tail_header; | |
964 | struct journal_point expanded = expand_journal_point(*recovery_point, increment); | |
965 | ||
966 | /* Only accept entries after the current recovery point. */ | |
967 | if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded)) | |
968 | return true; | |
969 | ||
970 | if ((header->entry_count >= journal->full_entries_per_block) && | |
971 | (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) { | |
972 | /* | |
973 | * The tail block does not have room for the entry we are attempting to add so | |
974 | * commit the tail block now. | |
975 | */ | |
976 | commit_tail(journal); | |
977 | } | |
978 | ||
979 | if (journal->waiting_to_commit) { | |
980 | vdo_start_operation_with_waiter(&journal->slab->state, | |
981 | VDO_ADMIN_STATE_WAITING_FOR_RECOVERY, | |
982 | parent, NULL); | |
983 | return false; | |
984 | } | |
985 | ||
986 | if (journal_length(journal) >= journal->size) { | |
987 | /* | |
988 | * We must have reaped the current head before the crash, since the blocked | |
989 | * threshold keeps us from having more entries than fit in a slab journal; hence we | |
990 | * can just advance the head (and unreapable block), as needed. | |
991 | */ | |
992 | journal->head++; | |
993 | journal->unreapable++; | |
994 | } | |
995 | ||
996 | if (journal->slab->status == VDO_SLAB_REBUILT) | |
997 | journal->slab->status = VDO_SLAB_REPLAYING; | |
998 | ||
999 | add_entry(journal, pbn, operation, increment, expanded); | |
1000 | return true; | |
1001 | } | |
1002 | ||
1003 | /** | |
1004 | * requires_reaping() - Check whether the journal must be reaped before adding new entries. | |
1005 | * @journal: The journal to check. | |
1006 | * | |
1007 | * Return: true if the journal must be reaped. | |
1008 | */ | |
1009 | static bool requires_reaping(const struct slab_journal *journal) | |
1010 | { | |
1011 | return (journal_length(journal) >= journal->blocking_threshold); | |
1012 | } | |
1013 | ||
1014 | /** finish_summary_update() - A waiter callback that resets the writing state of a slab. */ | |
d6e260cc | 1015 | static void finish_summary_update(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
1016 | { |
1017 | struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter); | |
1018 | int result = *((int *) context); | |
1019 | ||
1020 | slab->active_count--; | |
1021 | ||
1022 | if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { | |
1023 | uds_log_error_strerror(result, "failed to update slab summary"); | |
1024 | vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); | |
1025 | } | |
1026 | ||
1027 | check_if_slab_drained(slab); | |
1028 | } | |
1029 | ||
d6e260cc | 1030 | static void write_reference_block(struct vdo_waiter *waiter, void *context); |
09eff388 MS |
1031 | |
1032 | /** | |
1033 | * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring | |
1034 | * a VIO for it from the pool. | |
1035 | * @waiter: The waiter of the block which is starting to write. | |
1036 | * @context: The parent slab of the block. | |
1037 | * | |
1038 | * This can be asynchronous since the writer will have to wait if all VIOs in the pool are | |
1039 | * currently in use. | |
1040 | */ | |
d6e260cc | 1041 | static void launch_reference_block_write(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
1042 | { |
1043 | struct vdo_slab *slab = context; | |
1044 | ||
1045 | if (vdo_is_read_only(slab->allocator->depot->vdo)) | |
1046 | return; | |
1047 | ||
1048 | slab->active_count++; | |
1049 | container_of(waiter, struct reference_block, waiter)->is_writing = true; | |
1050 | waiter->callback = write_reference_block; | |
1051 | acquire_vio_from_pool(slab->allocator->vio_pool, waiter); | |
1052 | } | |
1053 | ||
1054 | static void save_dirty_reference_blocks(struct vdo_slab *slab) | |
1055 | { | |
d6e260cc MS |
1056 | vdo_waitq_notify_all_waiters(&slab->dirty_blocks, |
1057 | launch_reference_block_write, slab); | |
09eff388 MS |
1058 | check_if_slab_drained(slab); |
1059 | } | |
1060 | ||
1061 | /** | |
1062 | * finish_reference_block_write() - After a reference block has been written, clean it, release its |
1063 | * locks, and return its VIO to the pool. | |
1064 | * @completion: The VIO that just finished writing. | |
1065 | */ | |
1066 | static void finish_reference_block_write(struct vdo_completion *completion) | |
1067 | { | |
1068 | struct vio *vio = as_vio(completion); | |
1069 | struct pooled_vio *pooled = vio_as_pooled_vio(vio); | |
1070 | struct reference_block *block = completion->parent; | |
1071 | struct vdo_slab *slab = block->slab; | |
1072 | tail_block_offset_t offset; | |
1073 | ||
1074 | slab->active_count--; | |
1075 | ||
1076 | /* Release the slab journal lock. */ | |
1077 | adjust_slab_journal_block_reference(&slab->journal, | |
1078 | block->slab_journal_lock_to_release, -1); | |
1079 | return_vio_to_pool(slab->allocator->vio_pool, pooled); | |
1080 | ||
1081 | /* | |
1082 | * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause | |
1083 | * us to be dirtied again, but we don't want to double enqueue. | |
1084 | */ | |
1085 | block->is_writing = false; | |
1086 | ||
1087 | if (vdo_is_read_only(completion->vdo)) { | |
1088 | check_if_slab_drained(slab); | |
1089 | return; | |
1090 | } | |
1091 | ||
1092 | /* Re-queue the block if it was re-dirtied while it was writing. */ | |
1093 | if (block->is_dirty) { | |
d6e260cc | 1094 | vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter); |
09eff388 MS |
1095 | if (vdo_is_state_draining(&slab->state)) { |
1096 | /* We must be saving, and this block will otherwise not be relaunched. */ | |
1097 | save_dirty_reference_blocks(slab); | |
1098 | } | |
1099 | ||
1100 | return; | |
1101 | } | |
1102 | ||
1103 | /* | |
1104 | * Mark the slab as clean in the slab summary if there are no dirty or writing blocks | |
1105 | * and no summary update in progress. | |
1106 | */ | |
d6e260cc | 1107 | if ((slab->active_count > 0) || vdo_waitq_has_waiters(&slab->dirty_blocks)) { |
09eff388 MS |
1108 | check_if_slab_drained(slab); |
1109 | return; | |
1110 | } | |
1111 | ||
1112 | offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset; | |
1113 | slab->active_count++; | |
1114 | slab->summary_waiter.callback = finish_summary_update; | |
1115 | update_slab_summary_entry(slab, &slab->summary_waiter, offset, | |
1116 | true, true, slab->free_blocks); | |
1117 | } | |
1118 | ||
1119 | /** | |
1120 | * get_reference_counters_for_block() - Find the reference counters for a given block. | |
1121 | * @block: The reference_block in question. | |
1122 | * | |
1123 | * Return: A pointer to the reference counters for this block. | |
1124 | */ | |
1125 | static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block) | |
1126 | { | |
1127 | size_t block_index = block - block->slab->reference_blocks; | |
1128 | ||
1129 | return &block->slab->counters[block_index * COUNTS_PER_BLOCK]; | |
1130 | } | |
1131 | ||
1132 | /** | |
1133 | * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out. | |
1134 | * @block: The block to copy. | |
1135 | * @buffer: The char buffer to fill with the packed block. | |
1136 | */ | |
1137 | static void pack_reference_block(struct reference_block *block, void *buffer) | |
1138 | { | |
1139 | struct packed_reference_block *packed = buffer; | |
1140 | vdo_refcount_t *counters = get_reference_counters_for_block(block); | |
1141 | sector_count_t i; | |
1142 | struct packed_journal_point commit_point; | |
1143 | ||
1144 | vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point); | |
1145 | ||
1146 | for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) { | |
1147 | packed->sectors[i].commit_point = commit_point; | |
1148 | memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR), | |
1149 | (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR)); | |
1150 | } | |
1151 | } | |
1152 | ||
1153 | static void write_reference_block_endio(struct bio *bio) | |
1154 | { | |
1155 | struct vio *vio = bio->bi_private; | |
1156 | struct reference_block *block = vio->completion.parent; | |
1157 | thread_id_t thread_id = block->slab->allocator->thread_id; | |
1158 | ||
1159 | continue_vio_after_io(vio, finish_reference_block_write, thread_id); | |
1160 | } | |
1161 | ||
1162 | /** | |
1163 | * handle_io_error() - Handle an I/O error reading or writing a reference count block. | |
1164 | * @completion: The VIO doing the I/O as a completion. | |
1165 | */ | |
1166 | static void handle_io_error(struct vdo_completion *completion) | |
1167 | { | |
1168 | int result = completion->result; | |
1169 | struct vio *vio = as_vio(completion); | |
1170 | struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; | |
1171 | ||
1172 | vio_record_metadata_io_error(vio); | |
1173 | return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); | |
1174 | slab->active_count--; | |
1175 | vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); | |
1176 | check_if_slab_drained(slab); | |
1177 | } | |
1178 | ||
1179 | /** | |
1180 | * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy | |
1181 | * its counters and associated data into the VIO, and launch the write. | |
1182 | * @waiter: The waiter of the dirty block. | |
1183 | * @context: The VIO returned by the pool. | |
1184 | */ | |
d6e260cc | 1185 | static void write_reference_block(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
1186 | { |
1187 | size_t block_offset; | |
1188 | physical_block_number_t pbn; | |
1189 | struct pooled_vio *pooled = context; | |
1190 | struct vdo_completion *completion = &pooled->vio.completion; | |
1191 | struct reference_block *block = container_of(waiter, struct reference_block, | |
1192 | waiter); | |
1193 | ||
1194 | pack_reference_block(block, pooled->vio.data); | |
1195 | block_offset = (block - block->slab->reference_blocks); | |
1196 | pbn = (block->slab->ref_counts_origin + block_offset); | |
1197 | block->slab_journal_lock_to_release = block->slab_journal_lock; | |
1198 | completion->parent = block; | |
1199 | ||
1200 | /* | |
1201 | * Mark the block as clean, since we won't be committing any updates that happen after this | |
1202 | * moment. As long as VIO order is preserved, two VIOs updating this block at once will not | |
1203 | * cause complications. | |
1204 | */ | |
1205 | block->is_dirty = false; | |
1206 | ||
1207 | /* | |
1208 | * Flush before writing to ensure that the recovery journal and slab journal entries which | |
dcd1332b SLM |
1209 | * cover this reference update are stable. This prevents data corruption that can be caused |
1210 | * by out of order writes. | |
09eff388 MS |
1211 | */ |
1212 | WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written, | |
1213 | block->slab->allocator->ref_counts_statistics.blocks_written + 1); | |
1214 | ||
1215 | completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id; | |
f7f46761 MS |
1216 | vdo_submit_metadata_vio(&pooled->vio, pbn, write_reference_block_endio, |
1217 | handle_io_error, REQ_OP_WRITE | REQ_PREFLUSH); | |
09eff388 MS |
1218 | } |
1219 | ||
1220 | static void reclaim_journal_space(struct slab_journal *journal) | |
1221 | { | |
1222 | block_count_t length = journal_length(journal); | |
1223 | struct vdo_slab *slab = journal->slab; | |
d6e260cc | 1224 | block_count_t write_count = vdo_waitq_num_waiters(&slab->dirty_blocks); |
09eff388 MS |
1225 | block_count_t written; |
1226 | ||
1227 | if ((length < journal->flushing_threshold) || (write_count == 0)) | |
1228 | return; | |
1229 | ||
1230 | /* The slab journal is over the first threshold, schedule some reference block writes. */ | |
1231 | WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1); | |
1232 | if (length < journal->flushing_deadline) { | |
1233 | /* Schedule more writes the closer to the deadline we get. */ | |
1234 | write_count /= journal->flushing_deadline - length + 1; | |
1235 | write_count = max_t(block_count_t, write_count, 1); | |
1236 | } | |
1237 | ||
1238 | for (written = 0; written < write_count; written++) { | |
d6e260cc MS |
1239 | vdo_waitq_notify_next_waiter(&slab->dirty_blocks, |
1240 | launch_reference_block_write, slab); | |
09eff388 MS |
1241 | } |
1242 | } | |
1243 | ||
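To make the scheduling arithmetic concrete with hypothetical values: given a flushing_threshold of 6, a flushing_deadline of 10, a journal length of 8, and 20 dirty reference blocks waiting, write_count becomes 20 / (10 - 8 + 1) = 6, so six reference block writes are launched on this pass; once the journal length reaches the deadline, the division is skipped and every waiting dirty block is written.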
1244 | /** | |
1245 | * reference_count_to_status() - Convert a reference count to a reference status. | |
1246 | * @count: The count to convert. | |
1247 | * | |
1248 | * Return: The appropriate reference status. | |
1249 | */ | |
1250 | static enum reference_status __must_check reference_count_to_status(vdo_refcount_t count) | |
1251 | { | |
1252 | if (count == EMPTY_REFERENCE_COUNT) | |
1253 | return RS_FREE; | |
1254 | else if (count == 1) | |
1255 | return RS_SINGLE; | |
1256 | else if (count == PROVISIONAL_REFERENCE_COUNT) | |
1257 | return RS_PROVISIONAL; | |
1258 | else | |
1259 | return RS_SHARED; | |
1260 | } | |
1261 | ||
1262 | /** | |
1263 | * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue | |
1264 | * if it wasn't already dirty. | |
1265 | * @block: The reference block to mark as dirty. | |
1266 | */ | |
1267 | static void dirty_block(struct reference_block *block) | |
1268 | { | |
1269 | if (block->is_dirty) | |
1270 | return; | |
1271 | ||
1272 | block->is_dirty = true; | |
1273 | if (!block->is_writing) | |
d6e260cc | 1274 | vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter); |
09eff388 MS |
1275 | } |
1276 | ||
1277 | /** | |
1278 | * get_reference_block() - Get the reference block that covers the given block index. | |
1279 | */ | |
1280 | static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab, | |
1281 | slab_block_number index) | |
1282 | { | |
1283 | return &slab->reference_blocks[index / COUNTS_PER_BLOCK]; | |
1284 | } | |
1285 | ||
1286 | /** | |
1287 | * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical | |
1288 | * block number. | |
1289 | * @slab: The slab. | |
1290 | * @pbn: The physical block number. |
1291 | * @slab_block_number_ptr: A pointer to the slab block number. | |
1292 | * | |
1293 | * Return: VDO_SUCCESS or an error code. | |
1294 | */ | |
1295 | static int __must_check slab_block_number_from_pbn(struct vdo_slab *slab, | |
1296 | physical_block_number_t pbn, | |
1297 | slab_block_number *slab_block_number_ptr) | |
1298 | { | |
1299 | u64 slab_block_number; | |
1300 | ||
1301 | if (pbn < slab->start) | |
1302 | return VDO_OUT_OF_RANGE; | |
1303 | ||
1304 | slab_block_number = pbn - slab->start; | |
1305 | if (slab_block_number >= slab->allocator->depot->slab_config.data_blocks) | |
1306 | return VDO_OUT_OF_RANGE; | |
1307 | ||
1308 | *slab_block_number_ptr = slab_block_number; | |
1309 | return VDO_SUCCESS; | |
1310 | } | |
1311 | ||
1312 | /** | |
1313 | * get_reference_counter() - Get the reference counter that covers the given physical block number. | |
1314 | * @slab: The slab to query. | |
1315 | * @pbn: The physical block number. | |
1316 | * @counter_ptr: A pointer to the reference counter. | |
1317 | */ | |
1318 | static int __must_check get_reference_counter(struct vdo_slab *slab, | |
1319 | physical_block_number_t pbn, | |
1320 | vdo_refcount_t **counter_ptr) | |
1321 | { | |
1322 | slab_block_number index; | |
1323 | int result = slab_block_number_from_pbn(slab, pbn, &index); | |
1324 | ||
1325 | if (result != VDO_SUCCESS) | |
1326 | return result; | |
1327 | ||
1328 | *counter_ptr = &slab->counters[index]; | |
1329 | ||
1330 | return VDO_SUCCESS; | |
1331 | } | |
1332 | ||
1333 | static unsigned int calculate_slab_priority(struct vdo_slab *slab) | |
1334 | { | |
1335 | block_count_t free_blocks = slab->free_blocks; | |
1336 | unsigned int unopened_slab_priority = slab->allocator->unopened_slab_priority; | |
1337 | unsigned int priority; | |
1338 | ||
1339 | /* | |
1340 | * Wholly full slabs must be the only ones with lowest priority, 0. | |
1341 | * | |
1342 | * Slabs that have never been opened (empty, newly initialized, and never been written to) | |
1343 | * have lower priority than previously opened slabs that have a significant number of free | |
1344 | * blocks. This ranking causes VDO to avoid writing physical blocks for the first time | |
1345 | * unless there are very few free blocks that have been previously written to. | |
1346 | * | |
1347 | * Since VDO doesn't discard blocks currently, reusing previously written blocks makes VDO | |
1348 | * a better client of any underlying storage that is thinly-provisioned (though discarding | |
1349 | * would be better). | |
1350 | * | |
1351 | * For all other slabs, the priority is derived from the logarithm of the number of free | |
1352 | * blocks. Slabs with the same order of magnitude of free blocks have the same priority. | |
1353 | * With 2^23 blocks, the priority will range from 1 to 25. The reserved | |
1354 | * unopened_slab_priority divides the range and is skipped by the logarithmic mapping. | |
1355 | */ | |
1356 | ||
1357 | if (free_blocks == 0) | |
1358 | return 0; | |
1359 | ||
1360 | if (is_slab_journal_blank(slab)) | |
1361 | return unopened_slab_priority; | |
1362 | ||
1363 | priority = (1 + ilog2(free_blocks)); | |
1364 | return ((priority < unopened_slab_priority) ? priority : priority + 1); | |
1365 | } | |
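/*
 * Worked example, assuming an unopened_slab_priority of 6 (an illustrative value only): a full
 * slab gets priority 0; a blank slab gets 6; a slab with 100 free blocks gets
 * 1 + ilog2(100) = 7, and since 7 >= 6 it is bumped to 8 so that 6 remains reserved for
 * unopened slabs.
 */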
1366 | ||
1367 | /* | |
1368 | * Slabs are essentially prioritized by an approximation of the number of free blocks in the slab,
1ccef45a | 1369 | * so slabs with lots of free blocks will be opened for allocation before slabs that have few free |
09eff388 MS |
1370 | * blocks. |
1371 | */ | |
1372 | static void prioritize_slab(struct vdo_slab *slab) | |
1373 | { | |
1374 | ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), | |
1375 | "a slab must not already be on a ring when prioritizing"); | |
1376 | slab->priority = calculate_slab_priority(slab); | |
1377 | vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, | |
1378 | slab->priority, &slab->allocq_entry); | |
1379 | } | |
1380 | ||
1381 | /** | |
1382 | * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab. | |
1ccef45a | 1383 | * @incremented: true if the free block count went up. |
09eff388 | 1384 | */ |
1ccef45a | 1385 | static void adjust_free_block_count(struct vdo_slab *slab, bool incremented) |
09eff388 MS |
1386 | { |
1387 | struct block_allocator *allocator = slab->allocator; | |
1388 | ||
1389 | WRITE_ONCE(allocator->allocated_blocks, | |
1ccef45a | 1390 | allocator->allocated_blocks + (incremented ? -1 : 1)); |
09eff388 MS |
1391 | |
1392 | /* The open slab doesn't need to be reprioritized until it is closed. */ | |
1393 | if (slab == allocator->open_slab) | |
1394 | return; | |
1395 | ||
1396 | /* Don't bother adjusting the priority table if unneeded. */ | |
1397 | if (slab->priority == calculate_slab_priority(slab)) | |
1398 | return; | |
1399 | ||
1400 | /* | |
1401 | * Reprioritize the slab to reflect the new free block count by removing it from the table | |
1402 | * and re-enqueuing it with the new priority. | |
1403 | */ | |
1404 | vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry); | |
1405 | prioritize_slab(slab); | |
1406 | } | |
1407 | ||
1408 | /** | |
1409 | * increment_for_data() - Increment the reference count for a data block. | |
1410 | * @slab: The slab which owns the block. | |
1411 | * @block: The reference block which contains the block being updated. | |
1412 | * @block_number: The block to update. | |
1413 | * @old_status: The reference status of the data block before this increment. | |
1414 | * @lock: The pbn_lock associated with this increment (may be NULL). | |
1415 | * @counter_ptr: A pointer to the count for the data block (in, out). | |
1416 | * @adjust_block_count: Whether to update the allocator's free block count. | |
1417 | * | |
1418 | * Return: VDO_SUCCESS or an error. | |
1419 | */ | |
1420 | static int increment_for_data(struct vdo_slab *slab, struct reference_block *block, | |
1421 | slab_block_number block_number, | |
1422 | enum reference_status old_status, | |
1423 | struct pbn_lock *lock, vdo_refcount_t *counter_ptr, | |
1424 | bool adjust_block_count) | |
1425 | { | |
1426 | switch (old_status) { | |
1427 | case RS_FREE: | |
1428 | *counter_ptr = 1; | |
1429 | block->allocated_count++; | |
1430 | slab->free_blocks--; | |
1431 | if (adjust_block_count) | |
1432 | adjust_free_block_count(slab, false); | |
1433 | ||
1434 | break; | |
1435 | ||
1436 | case RS_PROVISIONAL: | |
1437 | *counter_ptr = 1; | |
1438 | break; | |
1439 | ||
1440 | default: | |
1441 | /* Single or shared */ | |
1442 | if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) { | |
1443 | return uds_log_error_strerror(VDO_REF_COUNT_INVALID, | |
1444 | "Incrementing a block already having 254 references (slab %u, offset %u)", | |
1445 | slab->slab_number, block_number); | |
1446 | } | |
1447 | (*counter_ptr)++; | |
1448 | } | |
1449 | ||
1450 | if (lock != NULL) | |
1451 | vdo_unassign_pbn_lock_provisional_reference(lock); | |
1452 | return VDO_SUCCESS; | |
1453 | } | |
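/*
 * Summarizing the increment transitions above: RS_FREE goes straight to a count of 1 and
 * consumes a free block; RS_PROVISIONAL becomes a real count of 1 without touching the
 * free-block accounting, since the block was already counted as allocated when the provisional
 * reference was made; RS_SINGLE and RS_SHARED just bump the counter, failing once it reaches
 * MAXIMUM_REFERENCE_COUNT.
 */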
1454 | ||
1455 | /** | |
1456 | * decrement_for_data() - Decrement the reference count for a data block. | |
1457 | * @slab: The slab which owns the block. | |
1458 | * @block: The reference block which contains the block being updated. | |
1459 | * @block_number: The block to update. | |
1460 | * @old_status: The reference status of the data block before this decrement. | |
1461 | * @updater: The reference updater doing this operation in case we need to look up the pbn lock. | |
1462 | * @lock: The pbn_lock associated with the block being decremented (may be NULL). | |
1463 | * @counter_ptr: A pointer to the count for the data block (in, out). | |
1464 | * @adjust_block_count: Whether to update the allocator's free block count. | |
1465 | * | |
1466 | * Return: VDO_SUCCESS or an error. | |
1467 | */ | |
1468 | static int decrement_for_data(struct vdo_slab *slab, struct reference_block *block, | |
1469 | slab_block_number block_number, | |
1470 | enum reference_status old_status, | |
1471 | struct reference_updater *updater, | |
1472 | vdo_refcount_t *counter_ptr, bool adjust_block_count) | |
1473 | { | |
1474 | switch (old_status) { | |
1475 | case RS_FREE: | |
1476 | return uds_log_error_strerror(VDO_REF_COUNT_INVALID, | |
1477 | "Decrementing free block at offset %u in slab %u", | |
1478 | block_number, slab->slab_number); | |
1479 | ||
1480 | case RS_PROVISIONAL: | |
1481 | case RS_SINGLE: | |
1482 | if (updater->zpbn.zone != NULL) { | |
1483 | struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone, | |
1484 | updater->zpbn.pbn); | |
1485 | ||
1486 | if (lock != NULL) { | |
1487 | /* | |
1488 | * There is a read lock on this block, so the block must not become | |
1489 | * unreferenced. | |
1490 | */ | |
1491 | *counter_ptr = PROVISIONAL_REFERENCE_COUNT; | |
1492 | vdo_assign_pbn_lock_provisional_reference(lock); | |
1493 | break; | |
1494 | } | |
1495 | } | |
1496 | ||
1497 | *counter_ptr = EMPTY_REFERENCE_COUNT; | |
1498 | block->allocated_count--; | |
1499 | slab->free_blocks++; | |
1500 | if (adjust_block_count) | |
1501 | adjust_free_block_count(slab, true); | |
1502 | ||
1503 | break; | |
1504 | ||
1505 | default: | |
1506 | /* Shared */ | |
1507 | (*counter_ptr)--; | |
1508 | } | |
1509 | ||
1510 | return VDO_SUCCESS; | |
1511 | } | |
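/*
 * The decrement transitions mirror the increments: decrementing a free block is an error; a
 * single or provisional reference either drops to zero and returns the block to the free pool,
 * or, if a read lock still covers the block, is parked as a provisional reference so the block
 * cannot become unreferenced while locked; a shared reference simply decrements the counter.
 */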
1512 | ||
1513 | /** | |
1514 | * increment_for_block_map() - Increment the reference count for a block map page. | |
1515 | * @slab: The slab which owns the block. | |
1516 | * @block: The reference block which contains the block being updated. | |
1517 | * @block_number: The block to update. | |
1518 | * @old_status: The reference status of the block before this increment. | |
1519 | * @lock: The pbn_lock associated with this increment (may be NULL). | |
1520 | * @normal_operation: Whether we are in normal operation vs. recovery or rebuild. | |
1521 | * @counter_ptr: A pointer to the count for the block (in, out). | |
1522 | * @adjust_block_count: Whether to update the allocator's free block count. | |
1523 | * | |
1524 | * All block map increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map | |
1525 | * blocks never dedupe, they should never be adjusted from any other state. The adjustment always | |
1526 | * results in MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against block map | |
1527 | * blocks. | |
1528 | * | |
1529 | * Return: VDO_SUCCESS or an error. | |
1530 | */ | |
1531 | static int increment_for_block_map(struct vdo_slab *slab, struct reference_block *block, | |
1532 | slab_block_number block_number, | |
1533 | enum reference_status old_status, | |
1534 | struct pbn_lock *lock, bool normal_operation, | |
1535 | vdo_refcount_t *counter_ptr, bool adjust_block_count) | |
1536 | { | |
1537 | switch (old_status) { | |
1538 | case RS_FREE: | |
1539 | if (normal_operation) { | |
1540 | return uds_log_error_strerror(VDO_REF_COUNT_INVALID, | |
1541 | "Incrementing unallocated block map block (slab %u, offset %u)", | |
1542 | slab->slab_number, block_number); | |
1543 | } | |
1544 | ||
1545 | *counter_ptr = MAXIMUM_REFERENCE_COUNT; | |
1546 | block->allocated_count++; | |
1547 | slab->free_blocks--; | |
1548 | if (adjust_block_count) | |
1549 | adjust_free_block_count(slab, false); | |
1550 | ||
1551 | return VDO_SUCCESS; | |
1552 | ||
1553 | case RS_PROVISIONAL: | |
1554 | if (!normal_operation) | |
1555 | return uds_log_error_strerror(VDO_REF_COUNT_INVALID, | |
1556 | "Block map block had provisional reference during replay (slab %u, offset %u)", | |
1557 | slab->slab_number, block_number); | |
1558 | ||
1559 | *counter_ptr = MAXIMUM_REFERENCE_COUNT; | |
1560 | if (lock != NULL) | |
1561 | vdo_unassign_pbn_lock_provisional_reference(lock); | |
1562 | return VDO_SUCCESS; | |
1563 | ||
1564 | default: | |
1565 | return uds_log_error_strerror(VDO_REF_COUNT_INVALID, | |
1566 | "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)", | |
1567 | *counter_ptr, slab->slab_number, | |
1568 | block_number); | |
1569 | } | |
1570 | } | |
1571 | ||
1572 | static bool __must_check is_valid_journal_point(const struct journal_point *point) | |
1573 | { | |
1574 | return ((point != NULL) && (point->sequence_number > 0)); | |
1575 | } | |
1576 | ||
1577 | /** | |
1578 | * update_reference_count() - Update the reference count of a block. | |
1579 | * @slab: The slab which owns the block. | |
1580 | * @block: The reference block which contains the block being updated. | |
1581 | * @block_number: The block to update. | |
1582 | * @slab_journal_point: The slab journal point at which this update is journaled. | |
1583 | * @updater: The reference updater. | |
1584 | * @normal_operation: Whether we are in normal operation vs. recovery or rebuild. | |
1585 | * @adjust_block_count: Whether to update the slab's free block count. | |
1586 | * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement | |
1587 | * of a provisional reference. | |
1588 | * | |
1589 | * Return: VDO_SUCCESS or an error. | |
1590 | */ | |
1591 | static int update_reference_count(struct vdo_slab *slab, struct reference_block *block, | |
1592 | slab_block_number block_number, | |
1593 | const struct journal_point *slab_journal_point, | |
1594 | struct reference_updater *updater, | |
1595 | bool normal_operation, bool adjust_block_count, | |
1596 | bool *provisional_decrement_ptr) | |
1597 | { | |
1598 | vdo_refcount_t *counter_ptr = &slab->counters[block_number]; | |
1599 | enum reference_status old_status = reference_count_to_status(*counter_ptr); | |
1600 | int result; | |
1601 | ||
1602 | if (!updater->increment) { | |
1603 | result = decrement_for_data(slab, block, block_number, old_status, | |
1604 | updater, counter_ptr, adjust_block_count); | |
1605 | if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) { | |
1606 | if (provisional_decrement_ptr != NULL) | |
1607 | *provisional_decrement_ptr = true; | |
1608 | return VDO_SUCCESS; | |
1609 | } | |
1610 | } else if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) { | |
1611 | result = increment_for_data(slab, block, block_number, old_status, | |
1612 | updater->lock, counter_ptr, adjust_block_count); | |
1613 | } else { | |
1614 | result = increment_for_block_map(slab, block, block_number, old_status, | |
1615 | updater->lock, normal_operation, | |
1616 | counter_ptr, adjust_block_count); | |
1617 | } | |
1618 | ||
1619 | if (result != VDO_SUCCESS) | |
1620 | return result; | |
1621 | ||
1622 | if (is_valid_journal_point(slab_journal_point)) | |
1623 | slab->slab_journal_point = *slab_journal_point; | |
1624 | ||
1625 | return VDO_SUCCESS; | |
1626 | } | |
1627 | ||
1628 | static int __must_check adjust_reference_count(struct vdo_slab *slab, | |
1629 | struct reference_updater *updater, | |
1630 | const struct journal_point *slab_journal_point) | |
1631 | { | |
1632 | slab_block_number block_number; | |
1633 | int result; | |
1634 | struct reference_block *block; | |
1635 | bool provisional_decrement = false; | |
1636 | ||
1637 | if (!is_slab_open(slab)) | |
1638 | return VDO_INVALID_ADMIN_STATE; | |
1639 | ||
1640 | result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number); | |
1641 | if (result != VDO_SUCCESS) | |
1642 | return result; | |
1643 | ||
1644 | block = get_reference_block(slab, block_number); | |
1645 | result = update_reference_count(slab, block, block_number, slab_journal_point, | |
1646 | updater, NORMAL_OPERATION, true, | |
1647 | &provisional_decrement); | |
1648 | if ((result != VDO_SUCCESS) || provisional_decrement) | |
1649 | return result; | |
1650 | ||
1651 | if (block->is_dirty && (block->slab_journal_lock > 0)) { | |
1652 | sequence_number_t entry_lock = slab_journal_point->sequence_number; | |
1653 | /* | |
1654 | * This block is already dirty and a slab journal entry has been made for it since | |
1655 | * the last time it was clean. We must release the per-entry slab journal lock for | |
1656 | * the entry associated with the update we are now doing. | |
1657 | */ | |
1658 | result = ASSERT(is_valid_journal_point(slab_journal_point), | |
1659 | "Reference count adjustments need slab journal points."); | |
1660 | if (result != VDO_SUCCESS) | |
1661 | return result; | |
1662 | ||
1663 | adjust_slab_journal_block_reference(&slab->journal, entry_lock, -1); | |
1664 | return VDO_SUCCESS; | |
1665 | } | |
1666 | ||
1667 | /* | |
1668 | * This may be the first time we are applying an update for which there is a slab journal | |
1669 | * entry to this block since the block was cleaned. Therefore, we convert the per-entry | |
1670 | * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock. | |
1671 | */ | |
1672 | if (is_valid_journal_point(slab_journal_point)) | |
1673 | block->slab_journal_lock = slab_journal_point->sequence_number; | |
1674 | else | |
1675 | block->slab_journal_lock = 0; | |
1676 | ||
1677 | dirty_block(block); | |
1678 | return VDO_SUCCESS; | |
1679 | } | |
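/*
 * Putting the two cases above together: the first update that dirties a clean block converts
 * its per-entry slab journal lock into the block's own uncommitted lock
 * (block->slab_journal_lock), while every later update to the still-dirty block simply
 * releases its per-entry lock, since the block already holds a lock covering it.
 */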
1680 | ||
1681 | /** | |
1682 | * add_entry_from_waiter() - Add an entry to the slab journal. | |
1683 | * @waiter: The vio which should make an entry now. | |
1684 | * @context: The slab journal to make an entry in. | |
1685 | * | |
1686 | * This callback is invoked by add_entries() once it has determined that we are ready to make | |
1687 | * another entry in the slab journal. Implements waiter_callback_fn. | |
1688 | */ | |
d6e260cc | 1689 | static void add_entry_from_waiter(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
1690 | { |
1691 | int result; | |
1692 | struct reference_updater *updater = | |
1693 | container_of(waiter, struct reference_updater, waiter); | |
1694 | struct data_vio *data_vio = data_vio_from_reference_updater(updater); | |
1695 | struct slab_journal *journal = context; | |
1696 | struct slab_journal_block_header *header = &journal->tail_header; | |
1697 | struct journal_point slab_journal_point = { | |
1698 | .sequence_number = header->sequence_number, | |
1699 | .entry_count = header->entry_count, | |
1700 | }; | |
1701 | sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number; | |
1702 | ||
1703 | if (header->entry_count == 0) { | |
1704 | /* | |
1705 | * This is the first entry in the current tail block, so get a lock on the recovery | |
1706 | * journal which we will hold until this tail block is committed. | |
1707 | */ | |
1708 | get_lock(journal, header->sequence_number)->recovery_start = recovery_block; | |
1709 | if (journal->recovery_journal != NULL) { | |
1710 | zone_count_t zone_number = journal->slab->allocator->zone_number; | |
1711 | ||
1712 | vdo_acquire_recovery_journal_block_reference(journal->recovery_journal, | |
1713 | recovery_block, | |
1714 | VDO_ZONE_TYPE_PHYSICAL, | |
1715 | zone_number); | |
1716 | } | |
1717 | ||
1718 | mark_slab_journal_dirty(journal, recovery_block); | |
1719 | reclaim_journal_space(journal); | |
1720 | } | |
1721 | ||
1722 | add_entry(journal, updater->zpbn.pbn, updater->operation, updater->increment, | |
1723 | expand_journal_point(data_vio->recovery_journal_point, | |
1724 | updater->increment)); | |
1725 | ||
1726 | if (journal->slab->status != VDO_SLAB_REBUILT) { | |
1727 | /* | |
1728 | * If the slab is unrecovered, scrubbing will take care of the count since the | |
1729 | * update is now recorded in the journal. | |
1730 | */ | |
1731 | adjust_slab_journal_block_reference(journal, | |
1732 | slab_journal_point.sequence_number, -1); | |
1733 | result = VDO_SUCCESS; | |
1734 | } else { | |
1735 | /* Now that an entry has been made in the slab journal, update the counter. */ | |
1736 | result = adjust_reference_count(journal->slab, updater, | |
1737 | &slab_journal_point); | |
1738 | } | |
1739 | ||
1740 | if (updater->increment) | |
1741 | continue_data_vio_with_error(data_vio, result); | |
1742 | else | |
1743 | vdo_continue_completion(&data_vio->decrement_completion, result); | |
1744 | } | |
1745 | ||
1746 | /** | |
1747 | * is_next_entry_a_block_map_increment() - Check whether the next entry to be made is a block map | |
1748 | * increment. | |
1749 | * @journal: The journal. | |
1750 | * | |
1751 | * Return: true if the first entry waiter's operation is a block map increment. | |
1752 | */ | |
1753 | static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal) | |
1754 | { | |
d6e260cc | 1755 | struct vdo_waiter *waiter = vdo_waitq_get_first_waiter(&journal->entry_waiters); |
1ccef45a MS |
1756 | struct reference_updater *updater = |
1757 | container_of(waiter, struct reference_updater, waiter); | |
09eff388 MS |
1758 | |
1759 | return (updater->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING); | |
1760 | } | |
1761 | ||
1762 | /** | |
1763 | * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries. | |
1764 | * @journal: The journal to which entries may be added. | |
1765 | * | |
1766 | * By processing the queue in order, we ensure that slab journal entries are made in the same order | |
1767 | * as recovery journal entries for the same increment or decrement. | |
1768 | */ | |
1769 | static void add_entries(struct slab_journal *journal) | |
1770 | { | |
1771 | if (journal->adding_entries) { | |
1772 | /* Protect against re-entrancy. */ | |
1773 | return; | |
1774 | } | |
1775 | ||
1776 | journal->adding_entries = true; | |
d6e260cc | 1777 | while (vdo_waitq_has_waiters(&journal->entry_waiters)) { |
09eff388 MS |
1778 | struct slab_journal_block_header *header = &journal->tail_header; |
1779 | ||
1780 | if (journal->partial_write_in_progress || | |
1781 | (journal->slab->status == VDO_SLAB_REBUILDING)) { | |
1782 | /* | |
1783 | * Don't add entries while rebuilding or while a partial write is | |
dcd1332b | 1784 | * outstanding, as it could result in reference count corruption. |
09eff388 MS |
1785 | */ |
1786 | break; | |
1787 | } | |
1788 | ||
1789 | if (journal->waiting_to_commit) { | |
1790 | /* | |
1791 | * If we are waiting for resources to write the tail block, and the tail | |
1792 | * block is full, we can't make another entry. | |
1793 | */ | |
1794 | WRITE_ONCE(journal->events->tail_busy_count, | |
1795 | journal->events->tail_busy_count + 1); | |
1796 | break; | |
1797 | } else if (is_next_entry_a_block_map_increment(journal) && | |
1798 | (header->entry_count >= journal->full_entries_per_block)) { | |
1799 | /* | |
1800 | * The tail block does not have room for a block map increment, so commit | |
1801 | * it now. | |
1802 | */ | |
1803 | commit_tail(journal); | |
1804 | if (journal->waiting_to_commit) { | |
1805 | WRITE_ONCE(journal->events->tail_busy_count, | |
1806 | journal->events->tail_busy_count + 1); | |
1807 | break; | |
1808 | } | |
1809 | } | |
1810 | ||
1811 | /* If the slab is over the blocking threshold, make the vio wait. */ | |
1812 | if (requires_reaping(journal)) { | |
1813 | WRITE_ONCE(journal->events->blocked_count, | |
1814 | journal->events->blocked_count + 1); | |
1815 | save_dirty_reference_blocks(journal->slab); | |
1816 | break; | |
1817 | } | |
1818 | ||
1819 | if (header->entry_count == 0) { | |
1820 | struct journal_lock *lock = | |
1821 | get_lock(journal, header->sequence_number); | |
1822 | ||
1823 | /* | |
1824 | * Check if the on disk slab journal is full. Because of the blocking and | |
1825 | * scrubbing thresholds, this should never happen. | |
1826 | */ | |
1827 | if (lock->count > 0) { | |
1828 | ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, | |
1829 | "New block has locks, but journal is not full"); | |
1830 | ||
1831 | /* | |
1832 | * The blocking threshold must let the journal fill up if the new | |
1833 | * block has locks; if the blocking threshold is smaller than the | |
1834 | * journal size, the new block cannot possibly have locks already. | |
1835 | */ | |
1836 | ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size), | |
1837 | "New block can have locks already iff blocking threshold is at the end of the journal"); | |
1838 | ||
1839 | WRITE_ONCE(journal->events->disk_full_count, | |
1840 | journal->events->disk_full_count + 1); | |
1841 | save_dirty_reference_blocks(journal->slab); | |
1842 | break; | |
1843 | } | |
1844 | ||
1845 | /* | |
1846 | * Don't allow the new block to be reaped until all of the reference count | |
1847 | * blocks are written and the journal block has been fully committed as | |
1848 | * well. | |
1849 | */ | |
1850 | lock->count = journal->entries_per_block + 1; | |
1851 | ||
1852 | if (header->sequence_number == 1) { | |
1853 | struct vdo_slab *slab = journal->slab; | |
1854 | block_count_t i; | |
1855 | ||
1856 | /* | |
1857 | * This is the first entry in this slab journal, ever. Dirty all of | |
1858 | * the reference count blocks. Each will acquire a lock on the tail | |
1859 | * block so that the journal won't be reaped until the reference | |
1860 | * counts are initialized. The lock acquisition must be done by the | |
1861 | * ref_counts since here we don't know how many reference blocks | |
1862 | * the ref_counts has. | |
1863 | */ | |
1864 | for (i = 0; i < slab->reference_block_count; i++) { | |
1865 | slab->reference_blocks[i].slab_journal_lock = 1; | |
1866 | dirty_block(&slab->reference_blocks[i]); | |
1867 | } | |
1868 | ||
1869 | adjust_slab_journal_block_reference(journal, 1, | |
1870 | slab->reference_block_count); | |
1871 | } | |
1872 | } | |
1873 | ||
d6e260cc MS |
1874 | vdo_waitq_notify_next_waiter(&journal->entry_waiters, |
1875 | add_entry_from_waiter, journal); | |
09eff388 MS |
1876 | } |
1877 | ||
1878 | journal->adding_entries = false; | |
1879 | ||
1880 | /* If there are no waiters, and we are flushing or saving, commit the tail block. */ | |
1881 | if (vdo_is_state_draining(&journal->slab->state) && | |
1882 | !vdo_is_state_suspending(&journal->slab->state) && | |
d6e260cc | 1883 | !vdo_waitq_has_waiters(&journal->entry_waiters)) |
09eff388 MS |
1884 | commit_tail(journal); |
1885 | } | |
1886 | ||
1887 | /** | |
1888 | * reset_search_cursor() - Reset the free block search back to the first reference counter in the | |
1889 | * first reference block of a slab. | |
1890 | */ | |
1891 | static void reset_search_cursor(struct vdo_slab *slab) | |
1892 | { | |
1893 | struct search_cursor *cursor = &slab->search_cursor; | |
1894 | ||
1895 | cursor->block = cursor->first_block; | |
1896 | cursor->index = 0; | |
1897 | /* Unit tests have slabs with only one reference block (and it's a runt). */ | |
1898 | cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count); | |
1899 | } | |
1900 | ||
1901 | /** | |
1902 | * advance_search_cursor() - Advance the search cursor to the start of the next reference block in | |
1903 | * a slab. | |
1904 | * | |
1905 | * Wraps around to the first reference block if the current block is the last reference block. | |
1906 | * | |
1907 | * Return: true unless the cursor was at the last reference block. | |
1908 | */ | |
1909 | static bool advance_search_cursor(struct vdo_slab *slab) | |
1910 | { | |
1911 | struct search_cursor *cursor = &slab->search_cursor; | |
1912 | ||
1913 | /* | |
1914 | * If we just finished searching the last reference block, then wrap back around to the | |
1915 | * start of the array. | |
1916 | */ | |
1917 | if (cursor->block == cursor->last_block) { | |
1918 | reset_search_cursor(slab); | |
1919 | return false; | |
1920 | } | |
1921 | ||
1922 | /* We're not already at the end, so advance the cursor to the next block. */ | |
1923 | cursor->block++; | |
1924 | cursor->index = cursor->end_index; | |
1925 | ||
1926 | if (cursor->block == cursor->last_block) { | |
1927 | /* The last reference block will usually be a runt. */ | |
1928 | cursor->end_index = slab->block_count; | |
1929 | } else { | |
1930 | cursor->end_index += COUNTS_PER_BLOCK; | |
1931 | } | |
1932 | ||
1933 | return true; | |
1934 | } | |
1935 | ||
c9ba9fd3 MS |
1936 | /** |
1937 | * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild. | |
1938 | * | |
1939 | * Return: VDO_SUCCESS or an error. | |
1940 | */ | |
1941 | int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot, | |
1942 | physical_block_number_t pbn, | |
1943 | enum journal_operation operation) | |
1944 | { | |
1945 | int result; | |
1946 | slab_block_number block_number; | |
1947 | struct reference_block *block; | |
1948 | struct vdo_slab *slab = vdo_get_slab(depot, pbn); | |
1949 | struct reference_updater updater = { | |
1950 | .operation = operation, | |
1951 | .increment = true, | |
1952 | }; | |
1953 | ||
1954 | result = slab_block_number_from_pbn(slab, pbn, &block_number); | |
1955 | if (result != VDO_SUCCESS) | |
1956 | return result; | |
1957 | ||
1958 | block = get_reference_block(slab, block_number); | |
1959 | result = update_reference_count(slab, block, block_number, NULL, | |
1960 | &updater, !NORMAL_OPERATION, false, NULL); | |
1961 | if (result != VDO_SUCCESS) | |
1962 | return result; | |
1963 | ||
1964 | dirty_block(block); | |
1965 | return VDO_SUCCESS; | |
1966 | } | |
1967 | ||
09eff388 MS |
1968 | /** |
1969 | * replay_reference_count_change() - Replay the reference count adjustment from a slab journal | |
1970 | * entry into the reference count for a block. | |
1971 | * @slab: The slab. | |
1972 | * @entry_point: The slab journal point for the entry. | |
1973 | * @entry: The slab journal entry being replayed. | |
1974 | * | |
1975 | * The adjustment will be ignored if it was already recorded in the reference count. | |
1976 | * | |
1977 | * Return: VDO_SUCCESS or an error code. | |
1978 | */ | |
1979 | static int replay_reference_count_change(struct vdo_slab *slab, | |
1980 | const struct journal_point *entry_point, | |
1981 | struct slab_journal_entry entry) | |
1982 | { | |
1983 | int result; | |
1984 | struct reference_block *block = get_reference_block(slab, entry.sbn); | |
1985 | sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR; | |
1986 | struct reference_updater updater = { | |
1987 | .operation = entry.operation, | |
1988 | .increment = entry.increment, | |
1989 | }; | |
1990 | ||
1991 | if (!vdo_before_journal_point(&block->commit_points[sector], entry_point)) { | |
1992 | /* This entry is already reflected in the existing counts, so do nothing. */ | |
1993 | return VDO_SUCCESS; | |
1994 | } | |
1995 | ||
1996 | /* This entry is not yet counted in the reference counts. */ | |
1997 | result = update_reference_count(slab, block, entry.sbn, entry_point, | |
1998 | &updater, !NORMAL_OPERATION, false, NULL); | |
1999 | if (result != VDO_SUCCESS) | |
2000 | return result; | |
2001 | ||
2002 | dirty_block(block); | |
2003 | return VDO_SUCCESS; | |
2004 | } | |
2005 | ||
2006 | /** | |
2007 | * find_zero_byte_in_word() - Find the array index of the first zero byte in a word-sized range of | |
2008 | * reference counters. | |
2009 | * @word_ptr: A pointer to the eight counter bytes to check. | |
2010 | * @start_index: The array index corresponding to word_ptr[0]. | |
2011 | * @fail_index: The array index to return if no zero byte is found. | |
2012 | * | |
2013 | * The search does no bounds checking; the function relies on the array being sufficiently padded. | |
2014 | * | |
2015 | * Return: The array index of the first zero byte in the word, or the value passed as fail_index if | |
2016 | * no zero byte was found. | |
2017 | */ | |
2018 | static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr, | |
2019 | slab_block_number start_index, | |
2020 | slab_block_number fail_index) | |
2021 | { | |
2022 | u64 word = get_unaligned_le64(word_ptr); | |
2023 | ||
2024 | /* This looks like a loop, but GCC will unroll the eight iterations for us. */ | |
2025 | unsigned int offset; | |
2026 | ||
2027 | for (offset = 0; offset < BYTES_PER_WORD; offset++) { | |
2028 | /* Assumes little-endian byte order, which we have on X86. */ | |
2029 | if ((word & 0xFF) == 0) | |
2030 | return (start_index + offset); | |
2031 | word >>= 8; | |
2032 | } | |
2033 | ||
2034 | return fail_index; | |
2035 | } | |
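/*
 * Illustrative example: if the eight counters at word_ptr were { 3, 1, 7, 0, 2, 2, 1, 5 }, the
 * little-endian load places counter 0 in the low byte, so the loop finds the zero at offset 3
 * and returns start_index + 3; if none of the eight bytes were zero, fail_index would be
 * returned instead.
 */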
2036 | ||
2037 | /** | |
eebd4e16 JC |
2038 | * find_free_block() - Find the first block with a reference count of zero in the specified |
2039 | * range of reference counter indexes. | |
09eff388 MS |
2040 | * @slab: The slab counters to scan. |
2041 | * @index_ptr: A pointer to hold the array index of the free block. | |
2042 | * | |
2043 | * Exposed for unit testing. | |
2044 | * | |
2045 | * Return: true if a free block was found in the specified range. | |
2046 | */ | |
2047 | static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr) | |
2048 | { | |
2049 | slab_block_number zero_index; | |
2050 | slab_block_number next_index = slab->search_cursor.index; | |
2051 | slab_block_number end_index = slab->search_cursor.end_index; | |
2052 | u8 *next_counter = &slab->counters[next_index]; | |
2053 | u8 *end_counter = &slab->counters[end_index]; | |
2054 | ||
2055 | /* | |
2056 | * Search every byte of the first unaligned word. (Array is padded so reading past the end is | |
2057 | * safe.) | |
2058 | */ | |
2059 | zero_index = find_zero_byte_in_word(next_counter, next_index, end_index); | |
2060 | if (zero_index < end_index) { | |
2061 | *index_ptr = zero_index; | |
2062 | return true; | |
2063 | } | |
2064 | ||
2065 | /* | |
2066 | * On architectures where unaligned word access is expensive, this would be a good place to | |
2067 | * advance to an alignment boundary. | |
2068 | */ | |
2069 | next_index += BYTES_PER_WORD; | |
2070 | next_counter += BYTES_PER_WORD; | |
2071 | ||
2072 | /* | |
2073 | * Now we're word-aligned; check a word at a time until we find a word containing a zero. | |
2074 | * (Array is padded so reading past the end is safe.) | |
2075 | */ | |
2076 | while (next_counter < end_counter) { | |
2077 | /* | |
2078 | * The following code is currently an exact copy of the code preceding the loop, | |
2079 | * but if you try to merge them by using a do loop, it runs slower because a jump | |
2080 | * instruction gets added at the start of the iteration. | |
2081 | */ | |
2082 | zero_index = find_zero_byte_in_word(next_counter, next_index, end_index); | |
2083 | if (zero_index < end_index) { | |
2084 | *index_ptr = zero_index; | |
2085 | return true; | |
2086 | } | |
2087 | ||
2088 | next_index += BYTES_PER_WORD; | |
2089 | next_counter += BYTES_PER_WORD; | |
2090 | } | |
2091 | ||
2092 | return false; | |
2093 | } | |
2094 | ||
2095 | /** | |
2096 | * search_current_reference_block() - Search the reference block currently saved in the search | |
2097 | * cursor for a reference count of zero, starting at the saved | |
2098 | * counter index. | |
2099 | * @slab: The slab to search. | |
2100 | * @free_index_ptr: A pointer to receive the array index of the zero reference count. | |
2101 | * | |
2102 | * Return: true if an unreferenced counter was found. | |
2103 | */ | |
2104 | static bool search_current_reference_block(const struct vdo_slab *slab, | |
2105 | slab_block_number *free_index_ptr) | |
2106 | { | |
2107 | /* Don't bother searching if the current block is known to be full. */ | |
2108 | return ((slab->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) && | |
2109 | find_free_block(slab, free_index_ptr)); | |
2110 | } | |
2111 | ||
2112 | /** | |
2113 | * search_reference_blocks() - Search each reference block for a reference count of zero. | |
2114 | * @slab: The slab to search. | |
2115 | * @free_index_ptr: A pointer to receive the array index of the zero reference count. | |
2116 | * | |
2117 | * Searches each reference block for a reference count of zero, starting at the reference block and | |
2118 | * counter index saved in the search cursor and searching up to the end of the last reference | |
2119 | * block. The search does not wrap. | |
2120 | * | |
2121 | * Return: true if an unreferenced counter was found. | |
2122 | */ | |
2123 | static bool search_reference_blocks(struct vdo_slab *slab, | |
2124 | slab_block_number *free_index_ptr) | |
2125 | { | |
2126 | /* Start searching at the saved search position in the current block. */ | |
2127 | if (search_current_reference_block(slab, free_index_ptr)) | |
2128 | return true; | |
2129 | ||
2130 | /* Search each reference block up to the end of the slab. */ | |
2131 | while (advance_search_cursor(slab)) { | |
2132 | if (search_current_reference_block(slab, free_index_ptr)) | |
2133 | return true; | |
2134 | } | |
2135 | ||
2136 | return false; | |
2137 | } | |
2138 | ||
2139 | /** | |
2140 | * make_provisional_reference() - Do the bookkeeping for making a provisional reference. | |
2141 | */ | |
2142 | static void make_provisional_reference(struct vdo_slab *slab, | |
2143 | slab_block_number block_number) | |
2144 | { | |
2145 | struct reference_block *block = get_reference_block(slab, block_number); | |
2146 | ||
2147 | /* | |
2148 | * Make the initial transition from an unreferenced block to a | |
2149 | * provisionally allocated block. | |
2150 | */ | |
2151 | slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT; | |
2152 | ||
2153 | /* Account for the allocation. */ | |
2154 | block->allocated_count++; | |
2155 | slab->free_blocks--; | |
2156 | } | |
2157 | ||
2158 | /** | |
2159 | * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty. | |
2160 | */ | |
2161 | static void dirty_all_reference_blocks(struct vdo_slab *slab) | |
2162 | { | |
2163 | block_count_t i; | |
2164 | ||
2165 | for (i = 0; i < slab->reference_block_count; i++) | |
2166 | dirty_block(&slab->reference_blocks[i]); | |
2167 | } | |
2168 | ||
2169 | /** | |
2170 | * clear_provisional_references() - Clear the provisional reference counts from a reference block. | |
2171 | * @block: The block to clear. | |
2172 | */ | |
2173 | static void clear_provisional_references(struct reference_block *block) | |
2174 | { | |
2175 | vdo_refcount_t *counters = get_reference_counters_for_block(block); | |
2176 | block_count_t j; | |
2177 | ||
2178 | for (j = 0; j < COUNTS_PER_BLOCK; j++) { | |
2179 | if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { | |
2180 | counters[j] = EMPTY_REFERENCE_COUNT; | |
2181 | block->allocated_count--; | |
2182 | } | |
2183 | } | |
2184 | } | |
2185 | ||
2186 | static inline bool journal_points_equal(struct journal_point first, | |
2187 | struct journal_point second) | |
2188 | { | |
2189 | return ((first.sequence_number == second.sequence_number) && | |
2190 | (first.entry_count == second.entry_count)); | |
2191 | } | |
2192 | ||
2193 | /** | |
2194 | * unpack_reference_block() - Unpack reference count blocks into the internal memory structure. | |
2195 | * @packed: The written reference block to be unpacked. | |
2196 | * @block: The internal reference block to be loaded. | |
2197 | */ | |
2198 | static void unpack_reference_block(struct packed_reference_block *packed, | |
2199 | struct reference_block *block) | |
2200 | { | |
2201 | block_count_t index; | |
2202 | sector_count_t i; | |
2203 | struct vdo_slab *slab = block->slab; | |
2204 | vdo_refcount_t *counters = get_reference_counters_for_block(block); | |
2205 | ||
2206 | for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) { | |
2207 | struct packed_reference_sector *sector = &packed->sectors[i]; | |
2208 | ||
2209 | vdo_unpack_journal_point(§or->commit_point, &block->commit_points[i]); | |
2210 | memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts, | |
2211 | (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR)); | |
2212 | /* The slab_journal_point must be the latest point found in any sector. */ | |
2213 | if (vdo_before_journal_point(&slab->slab_journal_point, | |
2214 | &block->commit_points[i])) | |
2215 | slab->slab_journal_point = block->commit_points[i]; | |
2216 | ||
2217 | if ((i > 0) && | |
2218 | !journal_points_equal(block->commit_points[0], | |
2219 | block->commit_points[i])) { | |
2220 | size_t block_index = block - block->slab->reference_blocks; | |
2221 | ||
2222 | uds_log_warning("Torn write detected in sector %u of reference block %zu of slab %u", | |
2223 | i, block_index, block->slab->slab_number); | |
2224 | } | |
2225 | } | |
2226 | ||
2227 | block->allocated_count = 0; | |
2228 | for (index = 0; index < COUNTS_PER_BLOCK; index++) { | |
2229 | if (counters[index] != EMPTY_REFERENCE_COUNT) | |
2230 | block->allocated_count++; | |
2231 | } | |
2232 | } | |
2233 | ||
2234 | /** | |
2235 | * finish_reference_block_load() - After a reference block has been read, unpack it. | |
2236 | * @completion: The VIO that just finished reading. | |
2237 | */ | |
2238 | static void finish_reference_block_load(struct vdo_completion *completion) | |
2239 | { | |
2240 | struct vio *vio = as_vio(completion); | |
2241 | struct pooled_vio *pooled = vio_as_pooled_vio(vio); | |
2242 | struct reference_block *block = completion->parent; | |
2243 | struct vdo_slab *slab = block->slab; | |
2244 | ||
2245 | unpack_reference_block((struct packed_reference_block *) vio->data, block); | |
2246 | return_vio_to_pool(slab->allocator->vio_pool, pooled); | |
2247 | slab->active_count--; | |
2248 | clear_provisional_references(block); | |
2249 | ||
2250 | slab->free_blocks -= block->allocated_count; | |
2251 | check_if_slab_drained(slab); | |
2252 | } | |
2253 | ||
2254 | static void load_reference_block_endio(struct bio *bio) | |
2255 | { | |
2256 | struct vio *vio = bio->bi_private; | |
2257 | struct reference_block *block = vio->completion.parent; | |
2258 | ||
2259 | continue_vio_after_io(vio, finish_reference_block_load, | |
2260 | block->slab->allocator->thread_id); | |
2261 | } | |
2262 | ||
2263 | /** | |
2264 | * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the | |
2265 | * block. | |
2266 | * @waiter: The waiter of the block to load. | |
2267 | * @context: The VIO returned by the pool. | |
2268 | */ | |
d6e260cc | 2269 | static void load_reference_block(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
2270 | { |
2271 | struct pooled_vio *pooled = context; | |
2272 | struct vio *vio = &pooled->vio; | |
2273 | struct reference_block *block = | |
2274 | container_of(waiter, struct reference_block, waiter); | |
2275 | size_t block_offset = (block - block->slab->reference_blocks); | |
2276 | ||
2277 | vio->completion.parent = block; | |
f7f46761 MS |
2278 | vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset, |
2279 | load_reference_block_endio, handle_io_error, | |
2280 | REQ_OP_READ); | |
09eff388 MS |
2281 | } |
2282 | ||
2283 | /** | |
2284 | * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into a | |
2285 | * pre-allocated reference counter array. | |
2286 | */ | |
2287 | static void load_reference_blocks(struct vdo_slab *slab) | |
2288 | { | |
2289 | block_count_t i; | |
2290 | ||
2291 | slab->free_blocks = slab->block_count; | |
2292 | slab->active_count = slab->reference_block_count; | |
2293 | for (i = 0; i < slab->reference_block_count; i++) { | |
d6e260cc | 2294 | struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; |
09eff388 MS |
2295 | |
2296 | waiter->callback = load_reference_block; | |
2297 | acquire_vio_from_pool(slab->allocator->vio_pool, waiter); | |
2298 | } | |
2299 | } | |
2300 | ||
2301 | /** | |
2302 | * drain_slab() - Drain all reference count I/O. | |
2303 | * | |
2304 | * Depending upon the type of drain being performed (as recorded in the ref_count's vdo_slab), the | |
2305 | * reference blocks may be loaded from disk or dirty reference blocks may be written out. | |
2306 | */ | |
2307 | static void drain_slab(struct vdo_slab *slab) | |
2308 | { | |
2309 | bool save; | |
2310 | bool load; | |
2311 | const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state); | |
2312 | ||
2313 | if (state == VDO_ADMIN_STATE_SUSPENDING) | |
2314 | return; | |
2315 | ||
2316 | if ((state != VDO_ADMIN_STATE_REBUILDING) && | |
2317 | (state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING)) | |
2318 | commit_tail(&slab->journal); | |
2319 | ||
2320 | if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL)) | |
2321 | return; | |
2322 | ||
2323 | save = false; | |
2324 | load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts; | |
2325 | if (state == VDO_ADMIN_STATE_SCRUBBING) { | |
2326 | if (load) { | |
2327 | load_reference_blocks(slab); | |
2328 | return; | |
2329 | } | |
2330 | } else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) { | |
2331 | if (!load) { | |
2332 | /* These reference counts were never written, so mark them all dirty. */ | |
2333 | dirty_all_reference_blocks(slab); | |
2334 | } | |
2335 | save = true; | |
2336 | } else if (state == VDO_ADMIN_STATE_REBUILDING) { | |
2337 | /* | |
2338 | * Write out the counters if the slab has written them before, or it has any | |
2339 | * non-zero reference counts, or there are any slab journal blocks. | |
2340 | */ | |
2341 | block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks; | |
2342 | ||
2343 | if (load || (slab->free_blocks != data_blocks) || | |
2344 | !is_slab_journal_blank(slab)) { | |
2345 | dirty_all_reference_blocks(slab); | |
2346 | save = true; | |
2347 | } | |
2348 | } else if (state == VDO_ADMIN_STATE_SAVING) { | |
2349 | save = (slab->status == VDO_SLAB_REBUILT); | |
2350 | } else { | |
2351 | vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS); | |
2352 | return; | |
2353 | } | |
2354 | ||
2355 | if (save) | |
2356 | save_dirty_reference_blocks(slab); | |
2357 | } | |
2358 | ||
2359 | static int allocate_slab_counters(struct vdo_slab *slab) | |
2360 | { | |
2361 | int result; | |
2362 | size_t index, bytes; | |
2363 | ||
2364 | result = ASSERT(slab->reference_blocks == NULL, | |
2365 | "vdo_slab %u doesn't allocate refcounts twice", | |
2366 | slab->slab_number); | |
2367 | if (result != VDO_SUCCESS) | |
2368 | return result; | |
2369 | ||
0eea6b6e | 2370 | result = vdo_allocate(slab->reference_block_count, struct reference_block, |
09eff388 MS |
2371 | __func__, &slab->reference_blocks); |
2372 | if (result != VDO_SUCCESS) | |
2373 | return result; | |
2374 | ||
2375 | /* | |
2376 | * Allocate such that the runt slab has a full-length memory array, plus a little padding | |
2377 | * so we can word-search even at the very end. | |
2378 | */ | |
2379 | bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD); | |
0eea6b6e | 2380 | result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array", |
09eff388 MS |
2381 | &slab->counters); |
2382 | if (result != UDS_SUCCESS) { | |
0eea6b6e | 2383 | vdo_free(vdo_forget(slab->reference_blocks)); |
09eff388 MS |
2384 | return result; |
2385 | } | |
2386 | ||
2387 | slab->search_cursor.first_block = slab->reference_blocks; | |
2388 | slab->search_cursor.last_block = &slab->reference_blocks[slab->reference_block_count - 1]; | |
2389 | reset_search_cursor(slab); | |
2390 | ||
2391 | for (index = 0; index < slab->reference_block_count; index++) { | |
2392 | slab->reference_blocks[index] = (struct reference_block) { | |
2393 | .slab = slab, | |
2394 | }; | |
2395 | } | |
2396 | ||
2397 | return VDO_SUCCESS; | |
2398 | } | |
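/*
 * Sizing example with illustrative numbers: a slab with 4 reference blocks allocates
 * 4 * COUNTS_PER_BLOCK counters plus 16 bytes of padding (2 * BYTES_PER_WORD, with 8-byte
 * words), so the word-at-a-time search in find_free_block() can safely read a full word past
 * the last real counter.
 */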
2399 | ||
2400 | static int allocate_counters_if_clean(struct vdo_slab *slab) | |
2401 | { | |
2402 | if (vdo_is_state_clean_load(&slab->state)) | |
2403 | return allocate_slab_counters(slab); | |
2404 | ||
2405 | return VDO_SUCCESS; | |
2406 | } | |
2407 | ||
2408 | static void finish_loading_journal(struct vdo_completion *completion) | |
2409 | { | |
2410 | struct vio *vio = as_vio(completion); | |
2411 | struct slab_journal *journal = completion->parent; | |
2412 | struct vdo_slab *slab = journal->slab; | |
2413 | struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) vio->data; | |
2414 | struct slab_journal_block_header header; | |
2415 | ||
2416 | vdo_unpack_slab_journal_block_header(&block->header, &header); | |
2417 | ||
2418 | /* FIXME: should it be an error if the following conditional fails? */ | |
2419 | if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) && | |
2420 | (header.nonce == slab->allocator->nonce)) { | |
2421 | journal->tail = header.sequence_number + 1; | |
2422 | ||
2423 | /* | |
2424 | * If the slab is clean, this implies the slab journal is empty, so advance the | |
2425 | * head appropriately. | |
2426 | */ | |
2427 | journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ? | |
2428 | header.head : journal->tail); | |
2429 | journal->tail_header = header; | |
2430 | initialize_journal_state(journal); | |
2431 | } | |
2432 | ||
2433 | return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); | |
2434 | vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); | |
2435 | } | |
2436 | ||
2437 | static void read_slab_journal_tail_endio(struct bio *bio) | |
2438 | { | |
2439 | struct vio *vio = bio->bi_private; | |
2440 | struct slab_journal *journal = vio->completion.parent; | |
2441 | ||
2442 | continue_vio_after_io(vio, finish_loading_journal, | |
2443 | journal->slab->allocator->thread_id); | |
2444 | } | |
2445 | ||
2446 | static void handle_load_error(struct vdo_completion *completion) | |
2447 | { | |
2448 | int result = completion->result; | |
2449 | struct slab_journal *journal = completion->parent; | |
2450 | struct vio *vio = as_vio(completion); | |
2451 | ||
2452 | vio_record_metadata_io_error(vio); | |
2453 | return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); | |
2454 | vdo_finish_loading_with_result(&journal->slab->state, result); | |
2455 | } | |
2456 | ||
2457 | /** | |
2458 | * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio | |
2459 | * pool. | |
2460 | * @waiter: The vio pool waiter which has just been notified. | |
2461 | * @context: The vio pool entry given to the waiter. | |
2462 | * | |
2463 | * This is the success callback from acquire_vio_from_pool() when loading a slab journal. | |
2464 | */ | |
d6e260cc | 2465 | static void read_slab_journal_tail(struct vdo_waiter *waiter, void *context) |
09eff388 MS |
2466 | { |
2467 | struct slab_journal *journal = | |
2468 | container_of(waiter, struct slab_journal, resource_waiter); | |
2469 | struct vdo_slab *slab = journal->slab; | |
2470 | struct pooled_vio *pooled = context; | |
2471 | struct vio *vio = &pooled->vio; | |
2472 | tail_block_offset_t last_commit_point = | |
2473 | slab->allocator->summary_entries[slab->slab_number].tail_block_offset; | |
2474 | ||
2475 | /* | |
2476 | * Slab summary keeps the commit point offset, so the tail block is the block before that. | |
2477 | * Calculation supports small journals in unit tests. | |
2478 | */ | |
2479 | tail_block_offset_t tail_block = ((last_commit_point == 0) ? | |
2480 | (tail_block_offset_t)(journal->size - 1) : | |
2481 | (last_commit_point - 1)); | |
2482 | ||
2483 | vio->completion.parent = journal; | |
2484 | vio->completion.callback_thread_id = slab->allocator->thread_id; | |
f7f46761 MS |
2485 | vdo_submit_metadata_vio(vio, slab->journal_origin + tail_block, |
2486 | read_slab_journal_tail_endio, handle_load_error, | |
2487 | REQ_OP_READ); | |
09eff388 MS |
2488 | } |
2489 | ||
2490 | /** | |
2491 | * load_slab_journal() - Load a slab's journal by reading the journal's tail. | |
2492 | */ | |
2493 | static void load_slab_journal(struct vdo_slab *slab) | |
2494 | { | |
2495 | struct slab_journal *journal = &slab->journal; | |
2496 | tail_block_offset_t last_commit_point; | |
2497 | ||
2498 | last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset; | |
2499 | if ((last_commit_point == 0) && | |
2500 | !slab->allocator->summary_entries[slab->slab_number].load_ref_counts) { | |
2501 | /* | |
2502 | * This slab claims that it has a tail block at (journal->size - 1), but a head of | |
2503 | * 1. This is impossible, due to the scrubbing threshold, on a real system, so | |
2504 | * don't bother reading the (bogus) data off disk. | |
2505 | */ | |
2506 | ASSERT_LOG_ONLY(((journal->size < 16) || | |
2507 | (journal->scrubbing_threshold < (journal->size - 1))), | |
2508 | "Scrubbing threshold protects against reads of unwritten slab journal blocks"); | |
2509 | vdo_finish_loading_with_result(&slab->state, | |
2510 | allocate_counters_if_clean(slab)); | |
2511 | return; | |
2512 | } | |
2513 | ||
2514 | journal->resource_waiter.callback = read_slab_journal_tail; | |
2515 | acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter); | |
2516 | } | |
2517 | ||
c9ba9fd3 | 2518 | static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priority) |
09eff388 | 2519 | { |
c9ba9fd3 MS |
2520 | struct slab_scrubber *scrubber = &slab->allocator->scrubber; |
2521 | ||
2522 | ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT), | |
2523 | "slab to be scrubbed is unrecovered"); | |
2524 | ||
2525 | if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING) | |
09eff388 MS |
2526 | return; |
2527 | ||
c9ba9fd3 MS |
2528 | list_del_init(&slab->allocq_entry); |
2529 | if (!slab->was_queued_for_scrubbing) { | |
2530 | WRITE_ONCE(scrubber->slab_count, scrubber->slab_count + 1); | |
2531 | slab->was_queued_for_scrubbing = true; | |
2532 | } | |
2533 | ||
2534 | if (high_priority) { | |
2535 | slab->status = VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING; | |
2536 | list_add_tail(&slab->allocq_entry, &scrubber->high_priority_slabs); | |
2537 | return; | |
2538 | } | |
2539 | ||
2540 | list_add_tail(&slab->allocq_entry, &scrubber->slabs); | |
09eff388 MS |
2541 | } |
2542 | ||
c9ba9fd3 MS |
2543 | /* Queue a slab for allocation or scrubbing. */ |
2544 | static void queue_slab(struct vdo_slab *slab) | |
09eff388 | 2545 | { |
c9ba9fd3 MS |
2546 | struct block_allocator *allocator = slab->allocator; |
2547 | block_count_t free_blocks; | |
09eff388 MS |
2548 | int result; |
2549 | ||
c9ba9fd3 MS |
2550 | ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), |
2551 | "a requeued slab must not already be on a ring"); | |
09eff388 | 2552 | |
c9ba9fd3 MS |
2553 | if (vdo_is_read_only(allocator->depot->vdo)) |
2554 | return; | |
09eff388 | 2555 | |
c9ba9fd3 MS |
2556 | free_blocks = slab->free_blocks; |
2557 | result = ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks), | |
2558 | "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)", | |
2559 | slab->slab_number, (unsigned long long) free_blocks, | |
2560 | (unsigned long long) allocator->depot->slab_config.data_blocks); | |
2561 | if (result != VDO_SUCCESS) { | |
2562 | vdo_enter_read_only_mode(allocator->depot->vdo, result); | |
2563 | return; | |
2564 | } | |
09eff388 | 2565 | |
c9ba9fd3 MS |
2566 | if (slab->status != VDO_SLAB_REBUILT) { |
2567 | register_slab_for_scrubbing(slab, false); | |
2568 | return; | |
2569 | } | |
09eff388 | 2570 | |
c9ba9fd3 MS |
2571 | if (!vdo_is_state_resuming(&slab->state)) { |
2572 | /* | |
2573 | * If the slab is resuming, we've already accounted for it here, so don't do it | |
2574 | * again. | |
2575 | * FIXME: under what situation would the slab be resuming here? | |
2576 | */ | |
2577 | WRITE_ONCE(allocator->allocated_blocks, | |
2578 | allocator->allocated_blocks - free_blocks); | |
2579 | if (!is_slab_journal_blank(slab)) { | |
2580 | WRITE_ONCE(allocator->statistics.slabs_opened, | |
2581 | allocator->statistics.slabs_opened + 1); | |
2582 | } | |
2583 | } | |
09eff388 | 2584 | |
c9ba9fd3 MS |
2585 | if (allocator->depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING) |
2586 | reopen_slab_journal(slab); | |
09eff388 | 2587 | |
c9ba9fd3 | 2588 | prioritize_slab(slab); |
09eff388 MS |
2589 | } |
2590 | ||
2591 | /** | |
c9ba9fd3 | 2592 | * initiate_slab_action() - Initiate a slab action. |
09eff388 | 2593 | * |
c9ba9fd3 | 2594 | * Implements vdo_admin_initiator_fn. |
09eff388 | 2595 | */ |
c9ba9fd3 | 2596 | static void initiate_slab_action(struct admin_state *state) |
09eff388 | 2597 | { |
c9ba9fd3 | 2598 | struct vdo_slab *slab = container_of(state, struct vdo_slab, state); |
09eff388 | 2599 | |
c9ba9fd3 MS |
2600 | if (vdo_is_state_draining(state)) { |
2601 | const struct admin_state_code *operation = vdo_get_admin_state_code(state); | |
09eff388 | 2602 | |
c9ba9fd3 MS |
2603 | if (operation == VDO_ADMIN_STATE_SCRUBBING) |
2604 | slab->status = VDO_SLAB_REBUILDING; | |
09eff388 | 2605 | |
c9ba9fd3 MS |
2606 | drain_slab(slab); |
2607 | check_if_slab_drained(slab); | |
2608 | return; | |
09eff388 MS |
2609 | } |
2610 | ||
c9ba9fd3 MS |
2611 | if (vdo_is_state_loading(state)) { |
2612 | load_slab_journal(slab); | |
2613 | return; | |
09eff388 MS |
2614 | } |
2615 | ||
c9ba9fd3 MS |
2616 | if (vdo_is_state_resuming(state)) { |
2617 | queue_slab(slab); | |
2618 | vdo_finish_resuming(state); | |
2619 | return; | |
2620 | } | |
2621 | ||
2622 | vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE); | |
09eff388 | 2623 | } |
883069e3 MS |
2624 | |
2625 | /** | |
c9ba9fd3 MS |
2626 | * get_next_slab() - Get the next slab to scrub. |
2627 | * @scrubber: The slab scrubber. | |
2628 | * | |
2629 | * Return: The next slab to scrub or NULL if there are none. | |
883069e3 | 2630 | */ |
c9ba9fd3 | 2631 | static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber) |
883069e3 | 2632 | { |
c9ba9fd3 | 2633 | struct vdo_slab *slab; |
883069e3 | 2634 | |
c9ba9fd3 MS |
2635 | slab = list_first_entry_or_null(&scrubber->high_priority_slabs, |
2636 | struct vdo_slab, allocq_entry); | |
2637 | if (slab != NULL) | |
2638 | return slab; | |
2639 | ||
2640 | return list_first_entry_or_null(&scrubber->slabs, struct vdo_slab, | |
2641 | allocq_entry); | |
883069e3 MS |
2642 | } |
2643 | ||
c9ba9fd3 MS |
2644 | /** |
2645 | * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub. | |
2646 | * @scrubber: The scrubber to check. | |
2647 | * | |
2648 | * Return: true if the scrubber has slabs to scrub. | |
2649 | */ | |
1ccef45a | 2650 | static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber) |
883069e3 | 2651 | { |
c9ba9fd3 | 2652 | return (get_next_slab(scrubber) != NULL); |
883069e3 MS |
2653 | } |
2654 | ||
c9ba9fd3 MS |
2655 | /** |
2656 | * uninitialize_scrubber_vio() - Clean up the slab_scrubber's vio. | |
2657 | * @scrubber: The scrubber. | |
2658 | */ | |
2659 | static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber) | |
883069e3 | 2660 | { |
0eea6b6e | 2661 | vdo_free(vdo_forget(scrubber->vio.data)); |
c9ba9fd3 | 2662 | free_vio_components(&scrubber->vio); |
883069e3 MS |
2663 | } |
2664 | ||
2665 | /** | |
c9ba9fd3 MS |
2666 | * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because |
2667 | * there's been an error. | |
2668 | * @scrubber: The scrubber. | |
883069e3 | 2669 | */ |
c9ba9fd3 | 2670 | static void finish_scrubbing(struct slab_scrubber *scrubber, int result) |
883069e3 | 2671 | { |
d6e260cc | 2672 | bool notify = vdo_waitq_has_waiters(&scrubber->waiters); |
c9ba9fd3 MS |
2673 | bool done = !has_slabs_to_scrub(scrubber); |
2674 | struct block_allocator *allocator = | |
2675 | container_of(scrubber, struct block_allocator, scrubber); | |
2676 | ||
2677 | if (done) | |
2678 | uninitialize_scrubber_vio(scrubber); | |
2679 | ||
2680 | if (scrubber->high_priority_only) { | |
2681 | scrubber->high_priority_only = false; | |
0eea6b6e | 2682 | vdo_fail_completion(vdo_forget(scrubber->vio.completion.parent), result); |
c9ba9fd3 MS |
2683 | } else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) { |
2684 | /* All of our slabs were scrubbed, and we're the last allocator to finish. */ | |
2685 | enum vdo_state prior_state = | |
2686 | atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING, | |
2687 | VDO_DIRTY); | |
883069e3 | 2688 | |
c9ba9fd3 MS |
2689 | /* |
2690 | * To be safe, even if the CAS failed, ensure anything that follows is ordered with | |
2691 | * respect to whatever state change did happen. | |
2692 | */ | |
2693 | smp_mb__after_atomic(); | |
883069e3 | 2694 | |
c9ba9fd3 MS |
2695 | /* |
2696 | * We must check the VDO state here and not the depot's read_only_notifier since | |
2697 | * the compare-swap-above could have failed due to a read-only entry which our own | |
2698 | * thread does not yet know about. | |
2699 | */ | |
2700 | if (prior_state == VDO_DIRTY) | |
2701 | uds_log_info("VDO commencing normal operation"); | |
2702 | else if (prior_state == VDO_RECOVERING) | |
2703 | uds_log_info("Exiting recovery mode"); | |
883069e3 MS |
2704 | } |
2705 | ||
c9ba9fd3 MS |
2706 | /* |
2707 | * Note that the scrubber has stopped, and inform anyone who might be waiting for that to | |
2708 | * happen. | |
2709 | */ | |
2710 | if (!vdo_finish_draining(&scrubber->admin_state)) | |
2711 | WRITE_ONCE(scrubber->admin_state.current_state, | |
2712 | VDO_ADMIN_STATE_SUSPENDED); | |
2713 | ||
2714 | /* | |
2715 | * We can't notify waiters until after we've finished draining or they'll just requeue. | |
2716 | * Fortunately if there were waiters, we can't have been freed yet. | |
2717 | */ | |
2718 | if (notify) | |
d6e260cc | 2719 | vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL); |
883069e3 MS |
2720 | } |
2721 | ||
c9ba9fd3 MS |
2722 | static void scrub_next_slab(struct slab_scrubber *scrubber); |
2723 | ||
883069e3 | 2724 | /** |
c9ba9fd3 MS |
2725 | * slab_scrubbed() - Notify the scrubber that a slab has been scrubbed. |
2726 | * @completion: The slab rebuild completion. | |
883069e3 | 2727 | * |
c9ba9fd3 | 2728 | * This callback is registered in apply_journal_entries(). |
883069e3 | 2729 | */ |
c9ba9fd3 | 2730 | static void slab_scrubbed(struct vdo_completion *completion) |
883069e3 | 2731 | { |
c9ba9fd3 MS |
2732 | struct slab_scrubber *scrubber = |
2733 | container_of(as_vio(completion), struct slab_scrubber, vio); | |
2734 | struct vdo_slab *slab = scrubber->slab; | |
2735 | ||
2736 | slab->status = VDO_SLAB_REBUILT; | |
2737 | queue_slab(slab); | |
2738 | reopen_slab_journal(slab); | |
2739 | WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1); | |
2740 | scrub_next_slab(scrubber); | |
2741 | } | |
883069e3 | 2742 | |
c9ba9fd3 MS |
2743 | /** |
2744 | * abort_scrubbing() - Abort scrubbing due to an error. | |
2745 | * @scrubber: The slab scrubber. | |
2746 | * @result: The error. | |
2747 | */ | |
2748 | static void abort_scrubbing(struct slab_scrubber *scrubber, int result) | |
2749 | { | |
2750 | vdo_enter_read_only_mode(scrubber->vio.completion.vdo, result); | |
2751 | finish_scrubbing(scrubber, result); | |
883069e3 MS |
2752 | } |
2753 | ||
c9ba9fd3 MS |
2754 | /** |
2755 | * handle_scrubber_error() - Handle errors while rebuilding a slab. | |
2756 | * @completion: The slab rebuild completion. | |
2757 | */ | |
2758 | static void handle_scrubber_error(struct vdo_completion *completion) | |
883069e3 | 2759 | { |
c9ba9fd3 | 2760 | struct vio *vio = as_vio(completion); |
883069e3 | 2761 | |
c9ba9fd3 MS |
2762 | vio_record_metadata_io_error(vio); |
2763 | abort_scrubbing(container_of(vio, struct slab_scrubber, vio), | |
2764 | completion->result); | |
883069e3 MS |
2765 | } |
2766 | ||
2767 | /** | |
c9ba9fd3 MS |
2768 | * apply_block_entries() - Apply all the entries in a block to the reference counts. |
2769 | * @block: A block with entries to apply. | |
2770 | * @entry_count: The number of entries to apply. | |
2771 | * @block_number: The sequence number of the block. | |
2772 | * @slab: The slab to apply the entries to. | |
883069e3 | 2773 | * |
c9ba9fd3 | 2774 | * Return: VDO_SUCCESS or an error code. |
883069e3 | 2775 | */ |
c9ba9fd3 MS |
2776 | static int apply_block_entries(struct packed_slab_journal_block *block, |
2777 | journal_entry_count_t entry_count, | |
2778 | sequence_number_t block_number, struct vdo_slab *slab) | |
883069e3 | 2779 | { |
c9ba9fd3 MS |
2780 | struct journal_point entry_point = { |
2781 | .sequence_number = block_number, | |
2782 | .entry_count = 0, | |
2783 | }; | |
883069e3 | 2784 | int result; |
c9ba9fd3 MS |
2785 | slab_block_number max_sbn = slab->end - slab->start; |
2786 | ||
2787 | while (entry_point.entry_count < entry_count) { | |
2788 | struct slab_journal_entry entry = | |
2789 | vdo_decode_slab_journal_entry(block, entry_point.entry_count); | |
2790 | ||
2791 | if (entry.sbn > max_sbn) { | |
2792 | /* This entry is out of bounds. */ | |
2793 | return uds_log_error_strerror(VDO_CORRUPT_JOURNAL, | |
2794 | "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)", | |
2795 | (unsigned long long) block_number, | |
2796 | entry_point.entry_count, | |
2797 | entry.sbn, max_sbn); | |
2798 | } | |
883069e3 | 2799 | |
c9ba9fd3 MS |
2800 | result = replay_reference_count_change(slab, &entry_point, entry); |
2801 | if (result != VDO_SUCCESS) { | |
2802 | uds_log_error_strerror(result, | |
2803 | "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u", | |
2804 | (unsigned long long) block_number, | |
2805 | entry_point.entry_count, | |
2806 | vdo_get_journal_operation_name(entry.operation), | |
2807 | entry.sbn, slab->slab_number); | |
2808 | return result; | |
2809 | } | |
2810 | entry_point.entry_count++; | |
2811 | } | |
2812 | ||
2813 | return VDO_SUCCESS; | |
2814 | } | |
2815 | ||
2816 | /** | |
2817 | * apply_journal_entries() - Find the relevant vio of the slab journal and apply all valid entries. | |
2818 | * @completion: The metadata read vio completion. | |
2819 | * | |
2820 | * This is a callback registered in start_scrubbing(). | |
2821 | */ | |
2822 | static void apply_journal_entries(struct vdo_completion *completion) | |
2823 | { | |
2824 | int result; | |
1ccef45a MS |
2825 | struct slab_scrubber *scrubber = |
2826 | container_of(as_vio(completion), struct slab_scrubber, vio); | |
c9ba9fd3 MS |
2827 | struct vdo_slab *slab = scrubber->slab; |
2828 | struct slab_journal *journal = &slab->journal; | |
2829 | ||
2830 | /* Find the boundaries of the useful part of the journal. */ | |
2831 | sequence_number_t tail = journal->tail; | |
2832 | tail_block_offset_t end_index = (tail - 1) % journal->size; | |
2833 | char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE); | |
2834 | struct packed_slab_journal_block *end_block = | |
2835 | (struct packed_slab_journal_block *) end_data; | |
2836 | ||
2837 | sequence_number_t head = __le64_to_cpu(end_block->header.head); | |
2838 | tail_block_offset_t head_index = head % journal->size; | |
2839 | block_count_t index = head_index; | |
2840 | ||
2841 | struct journal_point ref_counts_point = slab->slab_journal_point; | |
2842 | struct journal_point last_entry_applied = ref_counts_point; | |
2843 | sequence_number_t sequence; | |
2844 | ||
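/*
 * Worked example (editorial illustration, not from the original source): with a journal
 * of size 8, a head of 10, and a tail of 13, head_index is 10 % 8 = 2 and end_index is
 * (13 - 1) % 8 = 4, so the loop below visits indexes 2, 3, and 4 for sequence numbers
 * 10, 11, and 12.
 */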
2845 | for (sequence = head; sequence < tail; sequence++) { | |
2846 | char *block_data = scrubber->vio.data + (index * VDO_BLOCK_SIZE); | |
2847 | struct packed_slab_journal_block *block = | |
2848 | (struct packed_slab_journal_block *) block_data; | |
2849 | struct slab_journal_block_header header; | |
2850 | ||
2851 | vdo_unpack_slab_journal_block_header(&block->header, &header); | |
2852 | ||
2853 | if ((header.nonce != slab->allocator->nonce) || | |
2854 | (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) || | |
2855 | (header.sequence_number != sequence) || | |
2856 | (header.entry_count > journal->entries_per_block) || | |
2857 | (header.has_block_map_increments && | |
2858 | (header.entry_count > journal->full_entries_per_block))) { | |
2859 | /* The block is not what we expect it to be. */ | |
2860 | uds_log_error("vdo_slab journal block for slab %u was invalid", | |
2861 | slab->slab_number); | |
2862 | abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL); | |
2863 | return; | |
2864 | } | |
2865 | ||
2866 | result = apply_block_entries(block, header.entry_count, sequence, slab); | |
2867 | if (result != VDO_SUCCESS) { | |
2868 | abort_scrubbing(scrubber, result); | |
2869 | return; | |
2870 | } | |
2871 | ||
2872 | last_entry_applied.sequence_number = sequence; | |
2873 | last_entry_applied.entry_count = header.entry_count - 1; | |
2874 | index++; | |
2875 | if (index == journal->size) | |
2876 | index = 0; | |
2877 | } | |
2878 | ||
2879 | /* | |
2880 | * At the end of rebuild, the reference counters should be accurate to the end of the | |
2881 | * journal we just applied. | |
2882 | */ | |
2883 | result = ASSERT(!vdo_before_journal_point(&last_entry_applied, | |
2884 | &ref_counts_point), | |
2885 | "Refcounts are not more accurate than the slab journal"); | |
2886 | if (result != VDO_SUCCESS) { | |
2887 | abort_scrubbing(scrubber, result); | |
2888 | return; | |
2889 | } | |
2890 | ||
2891 | /* Save out the rebuilt reference blocks. */ | |
2892 | vdo_prepare_completion(completion, slab_scrubbed, handle_scrubber_error, | |
2893 | slab->allocator->thread_id, completion->parent); | |
2894 | vdo_start_operation_with_waiter(&slab->state, | |
2895 | VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING, | |
2896 | completion, initiate_slab_action); | |
2897 | } | |
2898 | ||
2899 | static void read_slab_journal_endio(struct bio *bio) | |
2900 | { | |
2901 | struct vio *vio = bio->bi_private; | |
2902 | struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio); | |
2903 | ||
2904 | continue_vio_after_io(bio->bi_private, apply_journal_entries, | |
2905 | scrubber->slab->allocator->thread_id); | |
2906 | } | |
2907 | ||
2908 | /** | |
2909 | * start_scrubbing() - Read the current slab's journal from disk now that it has been flushed. | |
2910 | * @completion: The scrubber's vio completion. | |
2911 | * | |
2912 | * This callback is registered in scrub_next_slab(). | |
2913 | */ | |
2914 | static void start_scrubbing(struct vdo_completion *completion) | |
2915 | { | |
2916 | struct slab_scrubber *scrubber = | |
2917 | container_of(as_vio(completion), struct slab_scrubber, vio); | |
2918 | struct vdo_slab *slab = scrubber->slab; | |
2919 | ||
2920 | if (!slab->allocator->summary_entries[slab->slab_number].is_dirty) { | |
2921 | slab_scrubbed(completion); | |
2922 | return; | |
2923 | } | |
2924 | ||
f7f46761 MS |
2925 | vdo_submit_metadata_vio(&scrubber->vio, slab->journal_origin, |
2926 | read_slab_journal_endio, handle_scrubber_error, | |
2927 | REQ_OP_READ); | |
c9ba9fd3 MS |
2928 | } |
2929 | ||
2930 | /** | |
2931 | * scrub_next_slab() - Scrub the next slab if there is one. | |
2932 | * @scrubber: The scrubber. | |
2933 | */ | |
2934 | static void scrub_next_slab(struct slab_scrubber *scrubber) | |
2935 | { | |
2936 | struct vdo_completion *completion = &scrubber->vio.completion; | |
2937 | struct vdo_slab *slab; | |
2938 | ||
2939 | /* | |
2940 | * Note: this notify call is safe only because scrubbing can only be started when | |
2941 | * the VDO is quiescent. | |
2942 | */ | |
d6e260cc | 2943 | vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL); |
c9ba9fd3 MS |
2944 | |
2945 | if (vdo_is_read_only(completion->vdo)) { | |
2946 | finish_scrubbing(scrubber, VDO_READ_ONLY); | |
2947 | return; | |
2948 | } | |
2949 | ||
2950 | slab = get_next_slab(scrubber); | |
2951 | if ((slab == NULL) || | |
2952 | (scrubber->high_priority_only && list_empty(&scrubber->high_priority_slabs))) { | |
2953 | finish_scrubbing(scrubber, VDO_SUCCESS); | |
2954 | return; | |
2955 | } | |
2956 | ||
2957 | if (vdo_finish_draining(&scrubber->admin_state)) | |
2958 | return; | |
2959 | ||
2960 | list_del_init(&slab->allocq_entry); | |
2961 | scrubber->slab = slab; | |
2962 | vdo_prepare_completion(completion, start_scrubbing, handle_scrubber_error, | |
2963 | slab->allocator->thread_id, completion->parent); | |
2964 | vdo_start_operation_with_waiter(&slab->state, VDO_ADMIN_STATE_SCRUBBING, | |
2965 | completion, initiate_slab_action); | |
2966 | } | |
2967 | ||
2968 | /** | |
2969 | * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing. | |
2970 | * @allocator: The block_allocator to scrub. | |
2971 | * @parent: The completion to notify when scrubbing is done, implies high_priority, may be NULL. | |
2972 | */ | |
2973 | static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent) | |
2974 | { | |
2975 | struct slab_scrubber *scrubber = &allocator->scrubber; | |
2976 | ||
2977 | scrubber->vio.completion.parent = parent; | |
2978 | scrubber->high_priority_only = (parent != NULL); | |
2979 | if (!has_slabs_to_scrub(scrubber)) { | |
2980 | finish_scrubbing(scrubber, VDO_SUCCESS); | |
2981 | return; | |
2982 | } | |
2983 | ||
2984 | if (scrubber->high_priority_only && | |
2985 | vdo_is_priority_table_empty(allocator->prioritized_slabs) && | |
2986 | list_empty(&scrubber->high_priority_slabs)) | |
2987 | register_slab_for_scrubbing(get_next_slab(scrubber), true); | |
2988 | ||
2989 | vdo_resume_if_quiescent(&scrubber->admin_state); | |
2990 | scrub_next_slab(scrubber); | |
2991 | } | |
2992 | ||
2993 | static inline void assert_on_allocator_thread(thread_id_t thread_id, | |
2994 | const char *function_name) | |
2995 | { | |
2996 | ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id), | |
2997 | "%s called on correct thread", function_name); | |
2998 | } | |
2999 | ||
3000 | static void register_slab_with_allocator(struct block_allocator *allocator, | |
3001 | struct vdo_slab *slab) | |
3002 | { | |
3003 | allocator->slab_count++; | |
3004 | allocator->last_slab = slab->slab_number; | |
3005 | } | |
3006 | ||
7ce49449 MS |
3007 | /** |
3008 | * get_depot_slab_iterator() - Return a slab_iterator over the slabs in a slab_depot. | |
3009 | * @depot: The depot over which to iterate. | |
3010 | * @start: The number of the slab to start iterating from. | |
3011 | * @end: The number of the last slab which may be returned. | |
3012 | * @stride: The difference in slab number between successive slabs. | |
3013 | * | |
3014 | * Iteration always occurs from higher to lower numbered slabs. | |
3015 | * | |
3016 | * Return: An initialized iterator structure. | |
3017 | */ | |
3018 | static struct slab_iterator get_depot_slab_iterator(struct slab_depot *depot, | |
3019 | slab_count_t start, slab_count_t end, | |
3020 | slab_count_t stride) | |
3021 | { | |
3022 | struct vdo_slab **slabs = depot->slabs; | |
3023 | ||
3024 | return (struct slab_iterator) { | |
3025 | .slabs = slabs, | |
3026 | .next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]), | |
3027 | .end = end, | |
3028 | .stride = stride, | |
3029 | }; | |
3030 | } | |
3031 | ||
c9ba9fd3 MS |
3032 | static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator) |
3033 | { | |
3034 | return get_depot_slab_iterator(allocator->depot, allocator->last_slab, | |
3035 | allocator->zone_number, | |
3036 | allocator->depot->zone_count); | |
3037 | } | |
3038 | ||
3039 | /** | |
3040 | * next_slab() - Get the next slab from a slab_iterator and advance the iterator | |
3041 | * @iterator: The slab_iterator. | |
3042 | * | |
3043 | * Return: The next slab or NULL if the iterator is exhausted. | |
3044 | */ | |
3045 | static struct vdo_slab *next_slab(struct slab_iterator *iterator) | |
3046 | { | |
3047 | struct vdo_slab *slab = iterator->next; | |
3048 | ||
3049 | if ((slab == NULL) || (slab->slab_number < iterator->end + iterator->stride)) | |
3050 | iterator->next = NULL; | |
3051 | else | |
3052 | iterator->next = iterator->slabs[slab->slab_number - iterator->stride]; | |
3053 | ||
3054 | return slab; | |
3055 | } | |
3056 | ||
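/*
 * Worked example (editorial illustration, not from the original source): with 10 slabs
 * distributed round-robin over 4 zones, zone 1 owns slabs 1, 5, and 9, so its iterator
 * starts at last_slab = 9 with end = 1 and stride = 4. next_slab() yields 9, then 5,
 * then 1; since 1 < end + stride, 'next' becomes NULL and the iteration stops there.
 */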
3057 | /** | |
3058 | * abort_waiter() - Abort vios waiting to make journal entries when read-only. | |
3059 | * | |
3060 | * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone | |
3061 | * into read-only mode. Implements waiter_callback_fn. | |
3062 | */ | |
d6e260cc | 3063 | static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused) |
c9ba9fd3 MS |
3064 | { |
3065 | struct reference_updater *updater = | |
3066 | container_of(waiter, struct reference_updater, waiter); | |
3067 | struct data_vio *data_vio = data_vio_from_reference_updater(updater); | |
3068 | ||
3069 | if (updater->increment) { | |
3070 | continue_data_vio_with_error(data_vio, VDO_READ_ONLY); | |
3071 | return; | |
3072 | } | |
3073 | ||
3074 | vdo_continue_completion(&data_vio->decrement_completion, VDO_READ_ONLY); | |
3075 | } | |
3076 | ||
3077 | /* Implements vdo_read_only_notification_fn. */ | |
3078 | static void notify_block_allocator_of_read_only_mode(void *listener, | |
3079 | struct vdo_completion *parent) | |
3080 | { | |
3081 | struct block_allocator *allocator = listener; | |
3082 | struct slab_iterator iterator; | |
3083 | ||
3084 | assert_on_allocator_thread(allocator->thread_id, __func__); | |
3085 | iterator = get_slab_iterator(allocator); | |
3086 | while (iterator.next != NULL) { | |
3087 | struct vdo_slab *slab = next_slab(&iterator); | |
3088 | ||
d6e260cc MS |
3089 | vdo_waitq_notify_all_waiters(&slab->journal.entry_waiters, |
3090 | abort_waiter, &slab->journal); | |
c9ba9fd3 MS |
3091 | check_if_slab_drained(slab); |
3092 | } | |
3093 | ||
3094 | vdo_finish_completion(parent); | |
3095 | } | |
3096 | ||
3097 | /** | |
3098 | * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN lock if | |
3099 | * the block it locks is unreferenced. | |
3100 | * @slab: The slab which contains the block. | |
3101 | * @pbn: The physical block to reference. | |
3102 | * @lock: The lock. | |
3103 | * | |
3104 | * Return: VDO_SUCCESS or an error. | |
3105 | */ | |
3106 | int vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn, | |
3107 | struct pbn_lock *lock) | |
3108 | { | |
3109 | slab_block_number block_number; | |
3110 | int result; | |
3111 | ||
3112 | if (vdo_pbn_lock_has_provisional_reference(lock)) | |
3113 | return VDO_SUCCESS; | |
3114 | ||
3115 | if (!is_slab_open(slab)) | |
3116 | return VDO_INVALID_ADMIN_STATE; | |
3117 | ||
3118 | result = slab_block_number_from_pbn(slab, pbn, &block_number); | |
3119 | if (result != VDO_SUCCESS) | |
3120 | return result; | |
3121 | ||
3122 | if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) { | |
3123 | make_provisional_reference(slab, block_number); | |
3124 | if (lock != NULL) | |
3125 | vdo_assign_pbn_lock_provisional_reference(lock); | |
3126 | } | |
3127 | ||
3128 | if (vdo_pbn_lock_has_provisional_reference(lock)) | |
3129 | adjust_free_block_count(slab, false); | |
3130 | ||
3131 | return VDO_SUCCESS; | |
3132 | } | |
3133 | ||
3134 | static int __must_check allocate_slab_block(struct vdo_slab *slab, | |
3135 | physical_block_number_t *block_number_ptr) | |
3136 | { | |
3137 | slab_block_number free_index; | |
3138 | ||
3139 | if (!is_slab_open(slab)) | |
3140 | return VDO_INVALID_ADMIN_STATE; | |
3141 | ||
3142 | if (!search_reference_blocks(slab, &free_index)) | |
3143 | return VDO_NO_SPACE; | |
3144 | ||
3145 | ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT), | |
3146 | "free block must have ref count of zero"); | |
3147 | make_provisional_reference(slab, free_index); | |
3148 | adjust_free_block_count(slab, false); | |
3149 | ||
3150 | /* | |
3151 | * Update the search hint so the next search will start at the array index just past the | |
3152 | * free block we just found. | |
3153 | */ | |
3154 | slab->search_cursor.index = (free_index + 1); | |
3155 | ||
3156 | *block_number_ptr = slab->start + free_index; | |
3157 | return VDO_SUCCESS; | |
3158 | } | |
3159 | ||
3160 | /** | |
3161 | * open_slab() - Prepare a slab to be allocated from. | |
3162 | * @slab: The slab. | |
3163 | */ | |
3164 | static void open_slab(struct vdo_slab *slab) | |
3165 | { | |
3166 | reset_search_cursor(slab); | |
3167 | if (is_slab_journal_blank(slab)) { | |
3168 | WRITE_ONCE(slab->allocator->statistics.slabs_opened, | |
3169 | slab->allocator->statistics.slabs_opened + 1); | |
3170 | dirty_all_reference_blocks(slab); | |
3171 | } else { | |
3172 | WRITE_ONCE(slab->allocator->statistics.slabs_reopened, | |
3173 | slab->allocator->statistics.slabs_reopened + 1); | |
3174 | } | |
3175 | ||
3176 | slab->allocator->open_slab = slab; | |
3177 | } | |
3178 | ||
3179 | ||
3180 | /* | |
3181 | * The block allocated will have a provisional reference and the reference must be either confirmed | |
3182 | * with a subsequent increment or vacated with a subsequent decrement via | |
3183 | * vdo_release_block_reference(). | |
3184 | */ | |
3185 | int vdo_allocate_block(struct block_allocator *allocator, | |
3186 | physical_block_number_t *block_number_ptr) | |
3187 | { | |
3188 | int result; | |
3189 | ||
3190 | if (allocator->open_slab != NULL) { | |
3191 | /* Try to allocate the next block in the currently open slab. */ | |
3192 | result = allocate_slab_block(allocator->open_slab, block_number_ptr); | |
3193 | if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE)) | |
3194 | return result; | |
3195 | ||
3196 | /* Put the exhausted open slab back into the priority table. */ | |
3197 | prioritize_slab(allocator->open_slab); | |
3198 | } | |
3199 | ||
3200 | /* Remove the highest priority slab from the priority table and make it the open slab. */ | |
3201 | open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs), | |
3202 | struct vdo_slab, allocq_entry)); | |
3203 | ||
3204 | /* | |
3205 | * Try allocating again. If we're out of space immediately after opening a slab, then every | |
3206 | * slab must be fully allocated. | |
3207 | */ | |
3208 | return allocate_slab_block(allocator->open_slab, block_number_ptr); | |
3209 | } | |
3210 | ||
3211 | /** | |
3212 | * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab. | |
3213 | * @allocator: The block_allocator on which to wait. | |
3214 | * @waiter: The waiter. | |
3215 | * | |
3216 | * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no slabs to scrub, and | |
3217 | * some other error otherwise. | |
3218 | */ | |
3219 | int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator, | |
d6e260cc | 3220 | struct vdo_waiter *waiter) |
c9ba9fd3 MS |
3221 | { |
3222 | if (vdo_is_read_only(allocator->depot->vdo)) | |
3223 | return VDO_READ_ONLY; | |
3224 | ||
3225 | if (vdo_is_state_quiescent(&allocator->scrubber.admin_state)) | |
3226 | return VDO_NO_SPACE; | |
3227 | ||
d6e260cc | 3228 | vdo_waitq_enqueue_waiter(&allocator->scrubber.waiters, waiter); |
c9ba9fd3 MS |
3229 | return VDO_SUCCESS; |
3230 | } | |
3231 | ||
3232 | /** | |
3233 | * vdo_modify_reference_count() - Modify the reference count of a block by first making a slab | |
3234 | * journal entry and then updating the reference counter. | |
3235 | * | |
3236 | * @completion: The data_vio completion for which to add the entry. | |
3237 | * @updater: Which of the data_vio's reference updaters is being submitted. | |
3238 | */ | |
3239 | void vdo_modify_reference_count(struct vdo_completion *completion, | |
3240 | struct reference_updater *updater) | |
3241 | { | |
3242 | struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn); | |
3243 | ||
3244 | if (!is_slab_open(slab)) { | |
3245 | vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE); | |
3246 | return; | |
3247 | } | |
3248 | ||
3249 | if (vdo_is_read_only(completion->vdo)) { | |
3250 | vdo_continue_completion(completion, VDO_READ_ONLY); | |
3251 | return; | |
3252 | } | |
3253 | ||
d6e260cc | 3254 | vdo_waitq_enqueue_waiter(&slab->journal.entry_waiters, &updater->waiter); |
c9ba9fd3 MS |
3255 | if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(&slab->journal)) |
3256 | register_slab_for_scrubbing(slab, true); | |
3257 | ||
3258 | add_entries(&slab->journal); | |
3259 | } | |
3260 | ||
3261 | /* Release an unused provisional reference. */ | |
3262 | int vdo_release_block_reference(struct block_allocator *allocator, | |
3263 | physical_block_number_t pbn) | |
3264 | { | |
3265 | struct reference_updater updater; | |
3266 | ||
3267 | if (pbn == VDO_ZERO_BLOCK) | |
3268 | return VDO_SUCCESS; | |
3269 | ||
3270 | updater = (struct reference_updater) { | |
3271 | .operation = VDO_JOURNAL_DATA_REMAPPING, | |
3272 | .increment = false, | |
3273 | .zpbn = { | |
3274 | .pbn = pbn, | |
3275 | }, | |
3276 | }; | |
3277 | ||
3278 | return adjust_reference_count(vdo_get_slab(allocator->depot, pbn), | |
3279 | &updater, NULL); | |
3280 | } | |
3281 | ||
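/*
 * Editorial sketch (not part of the original source): one way a caller might pair
 * vdo_allocate_block() with vdo_release_block_reference() to vacate the provisional
 * reference on a block it no longer needs. The function name is hypothetical and it is
 * marked __maybe_unused since nothing in this file calls it.
 */
static int __maybe_unused example_allocate_then_discard(struct block_allocator *allocator)
{
	physical_block_number_t pbn;
	int result = vdo_allocate_block(allocator, &pbn);

	if (result != VDO_SUCCESS)
		return result;

	/* The new block carries a provisional reference; release it unused. */
	return vdo_release_block_reference(allocator, pbn);
}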
3282 | /* | |
3283 | * This is a min_heap callback function that orders slab_status structures using the 'is_clean' field as | |
3284 | * the primary key and the 'emptiness' field as the secondary key. | |
3285 | * | |
3286 | * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping | |
3287 | * should always get the most empty first, so pushing should be from most empty to least empty. | |
3288 | * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements | |
3289 | * before larger ones. | |
3290 | */ | |
3291 | static bool slab_status_is_less_than(const void *item1, const void *item2) | |
3292 | { | |
3293 | const struct slab_status *info1 = item1; | |
3294 | const struct slab_status *info2 = item2; | |
3295 | ||
3296 | if (info1->is_clean != info2->is_clean) | |
3297 | return info1->is_clean; | |
3298 | if (info1->emptiness != info2->emptiness) | |
3299 | return info1->emptiness > info2->emptiness; | |
3300 | return info1->slab_number < info2->slab_number; | |
3301 | } | |
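/*
 * Worked example (editorial illustration, not from the original source): a status with
 * {is_clean = true, emptiness = 10} sorts before one with {is_clean = false, emptiness = 200};
 * between two clean statuses, the one with emptiness 200 sorts before the one with
 * emptiness 10, so the heap pops the cleanest, emptiest slabs first.
 */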
3302 | ||
3303 | static void swap_slab_statuses(void *item1, void *item2) | |
3304 | { | |
3305 | struct slab_status *info1 = item1; | |
3306 | struct slab_status *info2 = item2; | |
3307 | ||
3308 | swap(*info1, *info2); | |
3309 | } | |
3310 | ||
3311 | static const struct min_heap_callbacks slab_status_min_heap = { | |
3312 | .elem_size = sizeof(struct slab_status), | |
3313 | .less = slab_status_is_less_than, | |
3314 | .swp = swap_slab_statuses, | |
3315 | }; | |
3316 | ||
3317 | /* Inform the slab actor that an action has finished on some slab; used by apply_to_slabs(). */ | |
3318 | static void slab_action_callback(struct vdo_completion *completion) | |
3319 | { | |
3320 | struct block_allocator *allocator = vdo_as_block_allocator(completion); | |
3321 | struct slab_actor *actor = &allocator->slab_actor; | |
3322 | ||
3323 | if (--actor->slab_action_count == 0) { | |
3324 | actor->callback(completion); | |
3325 | return; | |
3326 | } | |
3327 | ||
3328 | vdo_reset_completion(completion); | |
3329 | } | |
3330 | ||
3331 | /* Preserve the error from part of an action and continue. */ | |
3332 | static void handle_operation_error(struct vdo_completion *completion) | |
3333 | { | |
3334 | struct block_allocator *allocator = vdo_as_block_allocator(completion); | |
3335 | ||
3336 | if (allocator->state.waiter != NULL) | |
3337 | vdo_set_completion_result(allocator->state.waiter, completion->result); | |
3338 | completion->callback(completion); | |
3339 | } | |
3340 | ||
3341 | /* Perform an action on each of an allocator's slabs in parallel. */ | |
3342 | static void apply_to_slabs(struct block_allocator *allocator, vdo_action_fn callback) | |
3343 | { | |
3344 | struct slab_iterator iterator; | |
3345 | ||
3346 | vdo_prepare_completion(&allocator->completion, slab_action_callback, | |
3347 | handle_operation_error, allocator->thread_id, NULL); | |
3348 | allocator->completion.requeue = false; | |
3349 | ||
3350 | /* | |
3351 | * Since we are going to dequeue all of the slabs, the open slab will become invalid, so | |
3352 | * clear it. | |
3353 | */ | |
3354 | allocator->open_slab = NULL; | |
3355 | ||
3356 | /* Ensure that we don't finish before we're done starting. */ | |
3357 | allocator->slab_actor = (struct slab_actor) { | |
3358 | .slab_action_count = 1, | |
3359 | .callback = callback, | |
3360 | }; | |
3361 | ||
3362 | iterator = get_slab_iterator(allocator); | |
3363 | while (iterator.next != NULL) { | |
3364 | const struct admin_state_code *operation = | |
3365 | vdo_get_admin_state_code(&allocator->state); | |
3366 | struct vdo_slab *slab = next_slab(&iterator); | |
3367 | ||
3368 | list_del_init(&slab->allocq_entry); | |
3369 | allocator->slab_actor.slab_action_count++; | |
3370 | vdo_start_operation_with_waiter(&slab->state, operation, | |
3371 | &allocator->completion, | |
3372 | initiate_slab_action); | |
3373 | } | |
3374 | ||
3375 | slab_action_callback(&allocator->completion); | |
3376 | } | |
3377 | ||
3378 | static void finish_loading_allocator(struct vdo_completion *completion) | |
3379 | { | |
3380 | struct block_allocator *allocator = vdo_as_block_allocator(completion); | |
3381 | const struct admin_state_code *operation = | |
3382 | vdo_get_admin_state_code(&allocator->state); | |
3383 | ||
3384 | if (allocator->eraser != NULL) | |
0eea6b6e | 3385 | dm_kcopyd_client_destroy(vdo_forget(allocator->eraser)); |
c9ba9fd3 MS |
3386 | |
3387 | if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) { | |
3388 | void *context = | |
3389 | vdo_get_current_action_context(allocator->depot->action_manager); | |
3390 | ||
3391 | vdo_replay_into_slab_journals(allocator, context); | |
3392 | return; | |
3393 | } | |
3394 | ||
3395 | vdo_finish_loading(&allocator->state); | |
3396 | } | |
3397 | ||
3398 | static void erase_next_slab_journal(struct block_allocator *allocator); | |
3399 | ||
3400 | static void copy_callback(int read_err, unsigned long write_err, void *context) | |
3401 | { | |
3402 | struct block_allocator *allocator = context; | |
3403 | int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO); | |
3404 | ||
3405 | if (result != VDO_SUCCESS) { | |
3406 | vdo_fail_completion(&allocator->completion, result); | |
3407 | return; | |
3408 | } | |
3409 | ||
3410 | erase_next_slab_journal(allocator); | |
3411 | } | |
3412 | ||
3413 | /* erase_next_slab_journal() - Erase the next slab journal. */ | |
3414 | static void erase_next_slab_journal(struct block_allocator *allocator) | |
3415 | { | |
3416 | struct vdo_slab *slab; | |
3417 | physical_block_number_t pbn; | |
3418 | struct dm_io_region regions[1]; | |
3419 | struct slab_depot *depot = allocator->depot; | |
3420 | block_count_t blocks = depot->slab_config.slab_journal_blocks; | |
3421 | ||
3422 | if (allocator->slabs_to_erase.next == NULL) { | |
3423 | vdo_finish_completion(&allocator->completion); | |
3424 | return; | |
3425 | } | |
3426 | ||
3427 | slab = next_slab(&allocator->slabs_to_erase); | |
3428 | pbn = slab->journal_origin - depot->vdo->geometry.bio_offset; | |
3429 | regions[0] = (struct dm_io_region) { | |
3430 | .bdev = vdo_get_backing_device(depot->vdo), | |
3431 | .sector = pbn * VDO_SECTORS_PER_BLOCK, | |
3432 | .count = blocks * VDO_SECTORS_PER_BLOCK, | |
3433 | }; | |
3434 | dm_kcopyd_zero(allocator->eraser, 1, regions, 0, copy_callback, allocator); | |
3435 | } | |
3436 | ||
3437 | /* Implements vdo_admin_initiator_fn. */ | |
3438 | static void initiate_load(struct admin_state *state) | |
3439 | { | |
3440 | struct block_allocator *allocator = | |
3441 | container_of(state, struct block_allocator, state); | |
3442 | const struct admin_state_code *operation = vdo_get_admin_state_code(state); | |
3443 | ||
3444 | if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) { | |
3445 | /* | |
3446 | * Must requeue because the kcopyd client cannot be freed in the same stack frame | |
3447 | * as the kcopyd callback, lest it deadlock. | |
3448 | */ | |
3449 | vdo_prepare_completion_for_requeue(&allocator->completion, | |
3450 | finish_loading_allocator, | |
3451 | handle_operation_error, | |
3452 | allocator->thread_id, NULL); | |
3453 | allocator->eraser = dm_kcopyd_client_create(NULL); | |
4c79d556 MS |
3454 | if (IS_ERR(allocator->eraser)) { |
3455 | vdo_fail_completion(&allocator->completion, | |
3456 | PTR_ERR(allocator->eraser)); | |
3457 | allocator->eraser = NULL; | |
c9ba9fd3 MS |
3458 | return; |
3459 | } | |
3460 | allocator->slabs_to_erase = get_slab_iterator(allocator); | |
3461 | ||
3462 | erase_next_slab_journal(allocator); | |
3463 | return; | |
3464 | } | |
3465 | ||
3466 | apply_to_slabs(allocator, finish_loading_allocator); | |
3467 | } | |
3468 | ||
3469 | /** | |
3470 | * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have | |
3471 | * been recovered from the recovery journal. | |
3472 | * @completion: The allocator completion. | |
3473 | */ | |
3474 | void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion) | |
3475 | { | |
3476 | struct block_allocator *allocator = vdo_as_block_allocator(completion); | |
3477 | ||
3478 | vdo_finish_loading_with_result(&allocator->state, completion->result); | |
3479 | } | |
3480 | ||
3481 | static int get_slab_statuses(struct block_allocator *allocator, | |
3482 | struct slab_status **statuses_ptr) | |
3483 | { | |
3484 | int result; | |
3485 | struct slab_status *statuses; | |
3486 | struct slab_iterator iterator = get_slab_iterator(allocator); | |
3487 | ||
0eea6b6e | 3488 | result = vdo_allocate(allocator->slab_count, struct slab_status, __func__, |
c9ba9fd3 MS |
3489 | &statuses); |
3490 | if (result != VDO_SUCCESS) | |
3491 | return result; | |
3492 | ||
3493 | *statuses_ptr = statuses; | |
3494 | ||
3495 | while (iterator.next != NULL) { | |
3496 | slab_count_t slab_number = next_slab(&iterator)->slab_number; | |
3497 | ||
3498 | *statuses++ = (struct slab_status) { | |
3499 | .slab_number = slab_number, | |
3500 | .is_clean = !allocator->summary_entries[slab_number].is_dirty, | |
3501 | .emptiness = allocator->summary_entries[slab_number].fullness_hint, | |
3502 | }; | |
3503 | } | |
3504 | ||
3505 | return VDO_SUCCESS; | |
3506 | } | |
3507 | ||
3508 | /* Prepare slabs for allocation or scrubbing. */ | |
3509 | static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator *allocator) | |
3510 | { | |
3511 | struct slab_status current_slab_status; | |
3512 | struct min_heap heap; | |
3513 | int result; | |
3514 | struct slab_status *slab_statuses; | |
3515 | struct slab_depot *depot = allocator->depot; | |
3516 | ||
3517 | WRITE_ONCE(allocator->allocated_blocks, | |
3518 | allocator->slab_count * depot->slab_config.data_blocks); | |
3519 | result = get_slab_statuses(allocator, &slab_statuses); | |
3520 | if (result != VDO_SUCCESS) | |
3521 | return result; | |
3522 | ||
3523 | /* Sort the slabs by cleanliness, then by emptiness hint. */ | |
3524 | heap = (struct min_heap) { | |
3525 | .data = slab_statuses, | |
3526 | .nr = allocator->slab_count, | |
3527 | .size = allocator->slab_count, | |
3528 | }; | |
3529 | min_heapify_all(&heap, &slab_status_min_heap); | |
3530 | ||
3531 | while (heap.nr > 0) { | |
3532 | bool high_priority; | |
3533 | struct vdo_slab *slab; | |
3534 | struct slab_journal *journal; | |
3535 | ||
3536 | current_slab_status = slab_statuses[0]; | |
3537 | min_heap_pop(&heap, &slab_status_min_heap); | |
3538 | slab = depot->slabs[current_slab_status.slab_number]; | |
3539 | ||
3540 | if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) || | |
3541 | (!allocator->summary_entries[slab->slab_number].load_ref_counts && | |
3542 | current_slab_status.is_clean)) { | |
3543 | queue_slab(slab); | |
3544 | continue; | |
3545 | } | |
3546 | ||
3547 | slab->status = VDO_SLAB_REQUIRES_SCRUBBING; | |
3548 | journal = &slab->journal; | |
3549 | high_priority = ((current_slab_status.is_clean && | |
3550 | (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) || | |
3551 | (journal_length(journal) >= journal->scrubbing_threshold)); | |
3552 | register_slab_for_scrubbing(slab, high_priority); | |
3553 | } | |
3554 | ||
0eea6b6e | 3555 | vdo_free(slab_statuses); |
c9ba9fd3 MS |
3556 | return VDO_SUCCESS; |
3557 | } | |
3558 | ||
3559 | static const char *status_to_string(enum slab_rebuild_status status) | |
3560 | { | |
3561 | switch (status) { | |
3562 | case VDO_SLAB_REBUILT: | |
3563 | return "REBUILT"; | |
3564 | case VDO_SLAB_REQUIRES_SCRUBBING: | |
3565 | return "SCRUBBING"; | |
3566 | case VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING: | |
3567 | return "PRIORITY_SCRUBBING"; | |
3568 | case VDO_SLAB_REBUILDING: | |
3569 | return "REBUILDING"; | |
3570 | case VDO_SLAB_REPLAYING: | |
3571 | return "REPLAYING"; | |
3572 | default: | |
3573 | return "UNKNOWN"; | |
3574 | } | |
3575 | } | |
3576 | ||
3577 | void vdo_dump_block_allocator(const struct block_allocator *allocator) | |
3578 | { | |
3579 | unsigned int pause_counter = 0; | |
3580 | struct slab_iterator iterator = get_slab_iterator(allocator); | |
3581 | const struct slab_scrubber *scrubber = &allocator->scrubber; | |
3582 | ||
3583 | uds_log_info("block_allocator zone %u", allocator->zone_number); | |
3584 | while (iterator.next != NULL) { | |
3585 | struct vdo_slab *slab = next_slab(&iterator); | |
3586 | struct slab_journal *journal = &slab->journal; | |
3587 | ||
3588 | if (slab->reference_blocks != NULL) { | |
3589 | /* Terse because there are a lot of slabs to dump and syslog is lossy. */ | |
3590 | uds_log_info("slab %u: P%u, %llu free", slab->slab_number, | |
3591 | slab->priority, | |
3592 | (unsigned long long) slab->free_blocks); | |
3593 | } else { | |
3594 | uds_log_info("slab %u: status %s", slab->slab_number, | |
3595 | status_to_string(slab->status)); | |
3596 | } | |
3597 | ||
3598 | uds_log_info(" slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s", | |
d6e260cc | 3599 | vdo_waitq_num_waiters(&journal->entry_waiters), |
c9ba9fd3 MS |
3600 | uds_bool_to_string(journal->waiting_to_commit), |
3601 | uds_bool_to_string(journal->updating_slab_summary), | |
3602 | (unsigned long long) journal->head, | |
3603 | (unsigned long long) journal->unreapable, | |
3604 | (unsigned long long) journal->tail, | |
3605 | (unsigned long long) journal->next_commit, | |
3606 | (unsigned long long) journal->summarized, | |
3607 | (unsigned long long) journal->last_summarized, | |
3608 | (unsigned long long) journal->recovery_lock, | |
3609 | uds_bool_to_string(journal->recovery_lock != 0)); | |
3610 | /* | |
3611 | * Given the frequency with which the locks are just a tiny bit off, it might be | |
3612 | * worth dumping all the locks, but that might be too much logging. | |
3613 | */ | |
3614 | ||
3615 | if (slab->counters != NULL) { | |
3616 | /* Terse because there are a lot of slabs to dump and syslog is lossy. */ | |
3617 | uds_log_info(" slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)", | |
3618 | slab->free_blocks, slab->block_count, | |
3619 | slab->reference_block_count, | |
d6e260cc | 3620 | vdo_waitq_num_waiters(&slab->dirty_blocks), |
c9ba9fd3 MS |
3621 | slab->active_count, |
3622 | (unsigned long long) slab->slab_journal_point.sequence_number, | |
3623 | slab->slab_journal_point.entry_count); | |
3624 | } else { | |
3625 | uds_log_info(" no counters"); | |
3626 | } | |
3627 | ||
3628 | /* | |
3629 | * Wait for a while after each batch of 32 slabs dumped, an arbitrary number, | |
3630 | * allowing the kernel log a chance to be flushed instead of being overrun. | |
3631 | */ | |
3632 | if (pause_counter++ == 31) { | |
3633 | pause_counter = 0; | |
3634 | uds_pause_for_logger(); | |
3635 | } | |
3636 | } | |
3637 | ||
3638 | uds_log_info("slab_scrubber slab_count %u waiters %zu %s%s", | |
3639 | READ_ONCE(scrubber->slab_count), | |
d6e260cc | 3640 | vdo_waitq_num_waiters(&scrubber->waiters), |
c9ba9fd3 MS |
3641 | vdo_get_admin_state_code(&scrubber->admin_state)->name, |
3642 | scrubber->high_priority_only ? ", high_priority_only " : ""); | |
3643 | } | |
3644 | ||
3645 | static void free_slab(struct vdo_slab *slab) | |
3646 | { | |
3647 | if (slab == NULL) | |
3648 | return; | |
3649 | ||
3650 | list_del(&slab->allocq_entry); | |
0eea6b6e MS |
3651 | vdo_free(vdo_forget(slab->journal.block)); |
3652 | vdo_free(vdo_forget(slab->journal.locks)); | |
3653 | vdo_free(vdo_forget(slab->counters)); | |
3654 | vdo_free(vdo_forget(slab->reference_blocks)); | |
3655 | vdo_free(slab); | |
c9ba9fd3 MS |
3656 | } |
3657 | ||
3658 | static int initialize_slab_journal(struct vdo_slab *slab) | |
3659 | { | |
3660 | struct slab_journal *journal = &slab->journal; | |
3661 | const struct slab_config *slab_config = &slab->allocator->depot->slab_config; | |
3662 | int result; | |
3663 | ||
0eea6b6e | 3664 | result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock, |
c9ba9fd3 MS |
3665 | __func__, &journal->locks); |
3666 | if (result != VDO_SUCCESS) | |
3667 | return result; | |
3668 | ||
0eea6b6e | 3669 | result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block", |
c9ba9fd3 MS |
3670 | (char **) &journal->block); |
3671 | if (result != VDO_SUCCESS) | |
3672 | return result; | |
3673 | ||
3674 | journal->slab = slab; | |
3675 | journal->size = slab_config->slab_journal_blocks; | |
3676 | journal->flushing_threshold = slab_config->slab_journal_flushing_threshold; | |
3677 | journal->blocking_threshold = slab_config->slab_journal_blocking_threshold; | |
3678 | journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold; | |
3679 | journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK; | |
3680 | journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK; | |
3681 | journal->events = &slab->allocator->slab_journal_statistics; | |
3682 | journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal; | |
3683 | journal->tail = 1; | |
3684 | journal->head = 1; | |
3685 | ||
3686 | journal->flushing_deadline = journal->flushing_threshold; | |
3687 | /* | |
3688 | * Leave some time between the flushing deadline and the blocking threshold, so that | |
3689 | * hopefully all flushes complete before entries have to block. | |
3690 | */ | |
3691 | if ((journal->blocking_threshold - journal->flushing_threshold) > 5) | |
3692 | journal->flushing_deadline = journal->blocking_threshold - 5; | |
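/*
 * Hypothetical numbers for illustration (not from the original source): with a flushing
 * threshold of 60 blocks and a blocking threshold of 100, the deadline becomes 95,
 * leaving a 5-block margin before the blocking threshold.
 */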
3693 | ||
3694 | journal->slab_summary_waiter.callback = release_journal_locks; | |
3695 | ||
3696 | INIT_LIST_HEAD(&journal->dirty_entry); | |
3697 | INIT_LIST_HEAD(&journal->uncommitted_blocks); | |
3698 | ||
3699 | journal->tail_header.nonce = slab->allocator->nonce; | |
3700 | journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL; | |
3701 | initialize_journal_state(journal); | |
3702 | return VDO_SUCCESS; | |
3703 | } | |
3704 | ||
3705 | /** | |
3706 | * make_slab() - Construct a new, empty slab. | |
3707 | * @slab_origin: The physical block number within the block allocator partition of the first block | |
3708 | * in the slab. | |
3709 | * @allocator: The block allocator to which the slab belongs. | |
3710 | * @slab_number: The slab number of the slab. | |
3711 | * @is_new: true if this slab is being allocated as part of a resize. | |
3712 | * @slab_ptr: A pointer to receive the new slab. | |
3713 | * | |
3714 | * Return: VDO_SUCCESS or an error code. | |
3715 | */ | |
3716 | static int __must_check make_slab(physical_block_number_t slab_origin, | |
3717 | struct block_allocator *allocator, | |
3718 | slab_count_t slab_number, bool is_new, | |
3719 | struct vdo_slab **slab_ptr) | |
3720 | { | |
3721 | const struct slab_config *slab_config = &allocator->depot->slab_config; | |
3722 | struct vdo_slab *slab; | |
3723 | int result; | |
3724 | ||
0eea6b6e | 3725 | result = vdo_allocate(1, struct vdo_slab, __func__, &slab); |
c9ba9fd3 MS |
3726 | if (result != VDO_SUCCESS) |
3727 | return result; | |
3728 | ||
3729 | *slab = (struct vdo_slab) { | |
3730 | .allocator = allocator, | |
3731 | .start = slab_origin, | |
3732 | .end = slab_origin + slab_config->slab_blocks, | |
3733 | .slab_number = slab_number, | |
3734 | .ref_counts_origin = slab_origin + slab_config->data_blocks, | |
3735 | .journal_origin = | |
3736 | vdo_get_slab_journal_start_block(slab_config, slab_origin), | |
3737 | .block_count = slab_config->data_blocks, | |
3738 | .free_blocks = slab_config->data_blocks, | |
3739 | .reference_block_count = | |
3740 | vdo_get_saved_reference_count_size(slab_config->data_blocks), | |
3741 | }; | |
3742 | INIT_LIST_HEAD(&slab->allocq_entry); | |
3743 | ||
3744 | result = initialize_slab_journal(slab); | |
3745 | if (result != VDO_SUCCESS) { | |
3746 | free_slab(slab); | |
3747 | return result; | |
3748 | } | |
3749 | ||
3750 | if (is_new) { | |
3751 | vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW); | |
3752 | result = allocate_slab_counters(slab); | |
3753 | if (result != VDO_SUCCESS) { | |
3754 | free_slab(slab); | |
3755 | return result; | |
3756 | } | |
3757 | } else { | |
3758 | vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION); | |
3759 | } | |
3760 | ||
3761 | *slab_ptr = slab; | |
3762 | return VDO_SUCCESS; | |
3763 | } | |
3764 | ||
7ce49449 MS |
3765 | /** |
3766 | * allocate_slabs() - Allocate a new slab pointer array. | |
3767 | * @depot: The depot. | |
3768 | * @slab_count: The number of slabs the depot should have in the new array. | |
3769 | * | |
3770 | * Any existing slab pointers will be copied into the new array, and slabs will be allocated as | |
3771 | * needed. The newly allocated slabs will not be distributed for use by the block allocators. | |
3772 | * | |
3773 | * Return: VDO_SUCCESS or an error code. | |
3774 | */ | |
3775 | static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count) | |
3776 | { | |
3777 | block_count_t slab_size; | |
3778 | bool resizing = false; | |
3779 | physical_block_number_t slab_origin; | |
3780 | int result; | |
3781 | ||
0eea6b6e | 3782 | result = vdo_allocate(slab_count, struct vdo_slab *, |
7ce49449 MS |
3783 | "slab pointer array", &depot->new_slabs); |
3784 | if (result != VDO_SUCCESS) | |
3785 | return result; | |
3786 | ||
3787 | if (depot->slabs != NULL) { | |
3788 | memcpy(depot->new_slabs, depot->slabs, | |
3789 | depot->slab_count * sizeof(struct vdo_slab *)); | |
3790 | resizing = true; | |
3791 | } | |
3792 | ||
3793 | slab_size = depot->slab_config.slab_blocks; | |
3794 | slab_origin = depot->first_block + (depot->slab_count * slab_size); | |
3795 | ||
3796 | for (depot->new_slab_count = depot->slab_count; | |
3797 | depot->new_slab_count < slab_count; | |
3798 | depot->new_slab_count++, slab_origin += slab_size) { | |
3799 | struct block_allocator *allocator = | |
3800 | &depot->allocators[depot->new_slab_count % depot->zone_count]; | |
3801 | struct vdo_slab **slab_ptr = &depot->new_slabs[depot->new_slab_count]; | |
3802 | ||
3803 | result = make_slab(slab_origin, allocator, depot->new_slab_count, | |
3804 | resizing, slab_ptr); | |
3805 | if (result != VDO_SUCCESS) | |
3806 | return result; | |
3807 | } | |
3808 | ||
3809 | return VDO_SUCCESS; | |
3810 | } | |
3811 | ||
3812 | /** | |
3813 | * vdo_abandon_new_slabs() - Abandon any new slabs in this depot, freeing them as needed. | |
3814 | * @depot: The depot. | |
3815 | */ | |
3816 | void vdo_abandon_new_slabs(struct slab_depot *depot) | |
3817 | { | |
3818 | slab_count_t i; | |
3819 | ||
3820 | if (depot->new_slabs == NULL) | |
3821 | return; | |
3822 | ||
3823 | for (i = depot->slab_count; i < depot->new_slab_count; i++) | |
0eea6b6e | 3824 | free_slab(vdo_forget(depot->new_slabs[i])); |
7ce49449 MS |
3825 | depot->new_slab_count = 0; |
3826 | depot->new_size = 0; | |
0eea6b6e | 3827 | vdo_free(vdo_forget(depot->new_slabs)); |
7ce49449 MS |
3828 | } |
3829 | ||
3830 | /** | |
3831 | * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates. | |
3832 | * | |
3833 | * Implements vdo_zone_thread_getter_fn. | |
3834 | */ | |
3835 | static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number) | |
3836 | { | |
3837 | return ((struct slab_depot *) context)->allocators[zone_number].thread_id; | |
3838 | } | |
3839 | ||
3840 | /** | |
3841 | * release_recovery_journal_lock() - Request the slab journal to release the recovery journal lock | |
3842 | * it may hold on a specified recovery journal block. | |
3843 | * @journal: The slab journal. | |
3844 | * @recovery_lock: The sequence number of the recovery journal block whose locks should be | |
3845 | * released. | |
3846 | * | |
3847 | * Return: true if the journal does hold a lock on the specified block (which it will release). | |
3848 | */ | |
3849 | static bool __must_check release_recovery_journal_lock(struct slab_journal *journal, | |
3850 | sequence_number_t recovery_lock) | |
3851 | { | |
3852 | if (recovery_lock > journal->recovery_lock) { | |
3853 | ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock), | |
3854 | "slab journal recovery lock is not older than the recovery journal head"); | |
3855 | return false; | |
3856 | } | |
3857 | ||
3858 | if ((recovery_lock < journal->recovery_lock) || | |
3859 | vdo_is_read_only(journal->slab->allocator->depot->vdo)) | |
3860 | return false; | |
3861 | ||
3862 | /* All locks are held by the block which is in progress; write it. */ | |
3863 | commit_tail(journal); | |
3864 | return true; | |
3865 | } | |
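/*
 * Worked example (editorial illustration, not from the original source): if the journal
 * currently holds recovery lock 7 and the VDO is not read-only, a request for lock 7
 * commits the tail block and returns true, a request for lock 6 returns false, and a
 * request for lock 8 merely logs the assertion above and returns false.
 */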
3866 | ||
3867 | /* | |
3868 | * Request a commit of all dirty tail blocks which are locking the recovery journal block the depot | |
3869 | * is seeking to release. | |
3870 | * | |
3871 | * Implements vdo_zone_action_fn. | |
3872 | */ | |
3873 | static void release_tail_block_locks(void *context, zone_count_t zone_number, | |
3874 | struct vdo_completion *parent) | |
3875 | { | |
3876 | struct slab_journal *journal, *tmp; | |
3877 | struct slab_depot *depot = context; | |
3878 | struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals; | |
3879 | ||
3880 | list_for_each_entry_safe(journal, tmp, list, dirty_entry) { | |
3881 | if (!release_recovery_journal_lock(journal, | |
3882 | depot->active_release_request)) | |
3883 | break; | |
3884 | } | |
3885 | ||
3886 | vdo_finish_completion(parent); | |
3887 | } | |
3888 | ||
3889 | /** | |
3890 | * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks. | |
3891 | * | |
3892 | * Implements vdo_action_preamble_fn. | |
3893 | */ | |
3894 | static void prepare_for_tail_block_commit(void *context, struct vdo_completion *parent) | |
3895 | { | |
3896 | struct slab_depot *depot = context; | |
3897 | ||
3898 | depot->active_release_request = depot->new_release_request; | |
3899 | vdo_finish_completion(parent); | |
3900 | } | |
3901 | ||
3902 | /** | |
3903 | * schedule_tail_block_commit() - Schedule a tail block commit if necessary. | |
3904 | * | |
3905 | * This method should not be called directly. Rather, call vdo_schedule_default_action() on the | |
3906 | * depot's action manager. | |
3907 | * | |
3908 | * Implements vdo_action_scheduler_fn. | |
3909 | */ | |
3910 | static bool schedule_tail_block_commit(void *context) | |
3911 | { | |
3912 | struct slab_depot *depot = context; | |
3913 | ||
3914 | if (depot->new_release_request == depot->active_release_request) | |
3915 | return false; | |
3916 | ||
3917 | return vdo_schedule_action(depot->action_manager, | |
3918 | prepare_for_tail_block_commit, | |
3919 | release_tail_block_locks, | |
3920 | NULL, NULL); | |
3921 | } | |
3922 | ||
c9ba9fd3 MS |
3923 | /** |
3924 | * initialize_slab_scrubber() - Initialize an allocator's slab scrubber. | |
3925 | * @allocator: The allocator being initialized | |
3926 | * | |
3927 | * Return: VDO_SUCCESS or an error. | |
3928 | */ | |
3929 | static int initialize_slab_scrubber(struct block_allocator *allocator) | |
3930 | { | |
3931 | struct slab_scrubber *scrubber = &allocator->scrubber; | |
3932 | block_count_t slab_journal_size = | |
3933 | allocator->depot->slab_config.slab_journal_blocks; | |
3934 | char *journal_data; | |
3935 | int result; | |
3936 | ||
0eea6b6e | 3937 | result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size, |
c9ba9fd3 MS |
3938 | char, __func__, &journal_data); |
3939 | if (result != VDO_SUCCESS) | |
3940 | return result; | |
3941 | ||
3942 | result = allocate_vio_components(allocator->completion.vdo, | |
3943 | VIO_TYPE_SLAB_JOURNAL, | |
3944 | VIO_PRIORITY_METADATA, | |
3945 | allocator, slab_journal_size, | |
3946 | journal_data, &scrubber->vio); | |
3947 | if (result != VDO_SUCCESS) { | |
0eea6b6e | 3948 | vdo_free(journal_data); |
c9ba9fd3 MS |
3949 | return result; |
3950 | } | |
3951 | ||
3952 | INIT_LIST_HEAD(&scrubber->high_priority_slabs); | |
3953 | INIT_LIST_HEAD(&scrubber->slabs); | |
3954 | vdo_set_admin_state_code(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDED); | |
3955 | return VDO_SUCCESS; | |
3956 | } | |
3957 | ||
3958 | /** | |
3959 | * initialize_slab_summary_block() - Initialize a slab_summary_block. | |
3960 | * @allocator: The allocator which owns the block. | |
3961 | * @index: The index of this block in its zone's summary. | |
3962 | * | |
3963 | * Return: VDO_SUCCESS or an error. | |
3964 | */ | |
3965 | static int __must_check initialize_slab_summary_block(struct block_allocator *allocator, | |
3966 | block_count_t index) | |
3967 | { | |
3968 | struct slab_summary_block *block = &allocator->summary_blocks[index]; | |
3969 | int result; | |
3970 | ||
0eea6b6e | 3971 | result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries); |
c9ba9fd3 MS |
3972 | if (result != VDO_SUCCESS) |
3973 | return result; | |
3974 | ||
3975 | result = allocate_vio_components(allocator->depot->vdo, VIO_TYPE_SLAB_SUMMARY, | |
3976 | VIO_PRIORITY_METADATA, NULL, 1, | |
3977 | block->outgoing_entries, &block->vio); | |
3978 | if (result != VDO_SUCCESS) | |
3979 | return result; | |
3980 | ||
3981 | block->allocator = allocator; | |
3982 | block->entries = &allocator->summary_entries[VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK * index]; | |
3983 | block->index = index; | |
3984 | return VDO_SUCCESS; | |
3985 | } | |
3986 | ||
3987 | static int __must_check initialize_block_allocator(struct slab_depot *depot, | |
3988 | zone_count_t zone) | |
3989 | { | |
3990 | int result; | |
3991 | block_count_t i; | |
3992 | struct block_allocator *allocator = &depot->allocators[zone]; | |
3993 | struct vdo *vdo = depot->vdo; | |
3994 | block_count_t max_free_blocks = depot->slab_config.data_blocks; | |
3995 | unsigned int max_priority = (2 + ilog2(max_free_blocks)); | |
3996 | ||
3997 | *allocator = (struct block_allocator) { | |
3998 | .depot = depot, | |
3999 | .zone_number = zone, | |
4000 | .thread_id = vdo->thread_config.physical_threads[zone], | |
4001 | .nonce = vdo->states.vdo.nonce, | |
4002 | }; | |
4003 | ||
4004 | INIT_LIST_HEAD(&allocator->dirty_slab_journals); | |
4005 | vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION); | |
4006 | result = vdo_register_read_only_listener(vdo, allocator, | |
4007 | notify_block_allocator_of_read_only_mode, | |
4008 | allocator->thread_id); | |
4009 | if (result != VDO_SUCCESS) | |
4010 | return result; | |
4011 | ||
4012 | vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); | |
4013 | result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id, | |
4014 | VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, | |
4015 | allocator, &allocator->vio_pool); | |
4016 | if (result != VDO_SUCCESS) | |
4017 | return result; | |
4018 | ||
4019 | result = initialize_slab_scrubber(allocator); | |
4020 | if (result != VDO_SUCCESS) | |
4021 | return result; | |
4022 | ||
4023 | result = vdo_make_priority_table(max_priority, &allocator->prioritized_slabs); | |
4024 | if (result != VDO_SUCCESS) | |
4025 | return result; | |
4026 | ||
0eea6b6e | 4027 | result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE, |
c9ba9fd3 MS |
4028 | struct slab_summary_block, __func__, |
4029 | &allocator->summary_blocks); | |
4030 | if (result != VDO_SUCCESS) | |
4031 | return result; | |
4032 | ||
4033 | vdo_set_admin_state_code(&allocator->summary_state, | |
4034 | VDO_ADMIN_STATE_NORMAL_OPERATION); | |
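/*
 * Each zone has its own MAX_VDO_SLABS-entry region of the depot's shared summary buffer;
 * this allocator's entries start at offset (MAX_VDO_SLABS * zone) within it.
 */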
4035 | allocator->summary_entries = depot->summary_entries + (MAX_VDO_SLABS * zone); | |
4036 | ||
4037 | /* Initialize each summary block. */ | |
4038 | for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) { | |
4039 | result = initialize_slab_summary_block(allocator, i); | |
4040 | if (result != VDO_SUCCESS) | |
4041 | return result; | |
4042 | } | |
4043 | ||
4044 | /* | |
4045 | * Performing well atop thin provisioned storage requires either that VDO discards freed | |
4046 | * blocks, or that the block allocator try to use slabs that already have allocated blocks | |
4047 | * in preference to slabs that have never been opened. For reasons we have not been able to | |
4048 | * fully understand, some SSD machines have been very sensitive (50% reduction in | |
4049 | * test throughput) to very slight differences in the timing and locality of block | |
4050 | * allocation. Assigning a low priority to unopened slabs (max_priority/2, say) would be | |
4051 | * ideal for this purpose, but anything less than a very high threshold (max_priority - 1) | |
4052 | * hurts on these machines. | |
4053 | * | |
4054 | * This sets the free block threshold for preferring to open an unopened slab to the binary | |
4055 | * floor of 3/4ths the total number of data blocks in a slab, which will generally evaluate | |
4056 | * to about half the slab size. | |
4057 | */ | |
4058 | allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4)); | |
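/*
 * Worked example (the slab size here is purely illustrative): with 8192 data blocks per
 * slab, (8192 * 3) / 4 = 6144, whose binary floor is 2^ilog2(6144) = 4096, or about half
 * the slab, and unopened_slab_priority becomes 1 + 12 = 13.
 */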
4059 | ||
4060 | return VDO_SUCCESS; | |
4061 | } | |
4062 | ||
7ce49449 MS |
4063 | static int allocate_components(struct slab_depot *depot, |
4064 | struct partition *summary_partition) | |
4065 | { | |
4066 | int result; | |
4067 | zone_count_t zone; | |
4068 | slab_count_t slab_count; | |
4069 | u8 hint; | |
4070 | u32 i; | |
4071 | const struct thread_config *thread_config = &depot->vdo->thread_config; | |
4072 | ||
4073 | result = vdo_make_action_manager(depot->zone_count, get_allocator_thread_id, | |
4074 | thread_config->journal_thread, depot, | |
4075 | schedule_tail_block_commit, | |
4076 | depot->vdo, &depot->action_manager); | |
4077 | if (result != VDO_SUCCESS) | |
4078 | return result; | |
4079 | ||
4080 | depot->origin = depot->first_block; | |
4081 | ||
4082 | /* Block size must be a multiple of entry size. */ | |
4083 | BUILD_BUG_ON((VDO_BLOCK_SIZE % sizeof(struct slab_summary_entry)) != 0); | |
4084 | ||
4085 | depot->summary_origin = summary_partition->offset; | |
4086 | depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift); | |
0eea6b6e | 4087 | result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES, |
7ce49449 MS |
4088 | struct slab_summary_entry, __func__, |
4089 | &depot->summary_entries); | |
4090 | if (result != VDO_SUCCESS) | |
4091 | return result; | |
4092 | ||
4093 | ||
4094 | /* Initialize all the entries. */ | |
4095 | hint = compute_fullness_hint(depot, depot->slab_config.data_blocks); | |
4096 | for (i = 0; i < MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES; i++) { | |
4097 | /* | |
4098 | * This default tail block offset must be reflected in | |
4099 | * read_slab_journal_tail(). | |
4100 | */ | |
4101 | depot->summary_entries[i] = (struct slab_summary_entry) { | |
4102 | .tail_block_offset = 0, | |
4103 | .fullness_hint = hint, | |
4104 | .load_ref_counts = false, | |
4105 | .is_dirty = false, | |
4106 | }; | |
4107 | } | |
4108 | ||
7ce49449 MS |
4109 | slab_count = vdo_compute_slab_count(depot->first_block, depot->last_block, |
4110 | depot->slab_size_shift); | |
4111 | if (thread_config->physical_zone_count > slab_count) { | |
4112 | return uds_log_error_strerror(VDO_BAD_CONFIGURATION, | |
4113 | "%u physical zones exceeds slab count %u", | |
4114 | thread_config->physical_zone_count, | |
4115 | slab_count); | |
4116 | } | |
4117 | ||
4118 | /* Initialize the block allocators. */ | |
4119 | for (zone = 0; zone < depot->zone_count; zone++) { | |
4120 | result = initialize_block_allocator(depot, zone); | |
4121 | if (result != VDO_SUCCESS) | |
4122 | return result; | |
4123 | } | |
4124 | ||
4125 | /* Allocate slabs. */ | |
4126 | result = allocate_slabs(depot, slab_count); | |
4127 | if (result != VDO_SUCCESS) | |
4128 | return result; | |
4129 | ||
4130 | /* Use the new slabs. */ | |
4131 | for (i = depot->slab_count; i < depot->new_slab_count; i++) { | |
4132 | struct vdo_slab *slab = depot->new_slabs[i]; | |
4133 | ||
4134 | register_slab_with_allocator(slab->allocator, slab); | |
4135 | WRITE_ONCE(depot->slab_count, depot->slab_count + 1); | |
4136 | } | |
4137 | ||
4138 | depot->slabs = depot->new_slabs; | |
4139 | depot->new_slabs = NULL; | |
4140 | depot->new_slab_count = 0; | |
4141 | ||
4142 | return VDO_SUCCESS; | |
4143 | } | |
4144 | ||
4145 | /** | |
4146 | * vdo_decode_slab_depot() - Make a slab depot and configure it with the state read from the super | |
4147 | * block. | |
4148 | * @state: The slab depot state from the super block. | |
4149 | * @vdo: The VDO which will own the depot. | |
4150 | * @summary_partition: The partition which holds the slab summary. | |
4151 | * @depot_ptr: A pointer to hold the depot. | |
4152 | * | |
4153 | * Return: A success or error code. | |
4154 | */ | |
4155 | int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo, | |
4156 | struct partition *summary_partition, | |
4157 | struct slab_depot **depot_ptr) | |
4158 | { | |
4159 | unsigned int slab_size_shift; | |
4160 | struct slab_depot *depot; | |
4161 | int result; | |
4162 | ||
4163 | /* | |
4164 | * Calculate the bit shift for efficiently mapping block numbers to slabs. Using a shift | |
4165 | * requires that the slab size be a power of two. | |
4166 | */ | |
4167 | block_count_t slab_size = state.slab_config.slab_blocks; | |
4168 | ||
4169 | if (!is_power_of_2(slab_size)) { | |
4170 | return uds_log_error_strerror(UDS_INVALID_ARGUMENT, | |
4171 | "slab size must be a power of two"); | |
4172 | } | |
4173 | slab_size_shift = ilog2(slab_size); | |
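/*
 * For example (the slab size is configuration-dependent), a slab size of 8192 blocks
 * yields slab_size_shift = 13, so a slab number is computed as
 * (pbn - first_block) >> 13 instead of a division.
 */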
4174 | ||
0eea6b6e | 4175 | result = vdo_allocate_extended(struct slab_depot, |
7ce49449 MS |
4176 | vdo->thread_config.physical_zone_count, |
4177 | struct block_allocator, __func__, &depot); | |
4178 | if (result != VDO_SUCCESS) | |
4179 | return result; | |
4180 | ||
4181 | depot->vdo = vdo; | |
4182 | depot->old_zone_count = state.zone_count; | |
4183 | depot->zone_count = vdo->thread_config.physical_zone_count; | |
4184 | depot->slab_config = state.slab_config; | |
4185 | depot->first_block = state.first_block; | |
4186 | depot->last_block = state.last_block; | |
4187 | depot->slab_size_shift = slab_size_shift; | |
4188 | ||
4189 | result = allocate_components(depot, summary_partition); | |
4190 | if (result != VDO_SUCCESS) { | |
4191 | vdo_free_slab_depot(depot); | |
4192 | return result; | |
4193 | } | |
4194 | ||
4195 | *depot_ptr = depot; | |
4196 | return VDO_SUCCESS; | |
4197 | } | |
4198 | ||
c9ba9fd3 MS |
4199 | static void uninitialize_allocator_summary(struct block_allocator *allocator) |
4200 | { | |
4201 | block_count_t i; | |
4202 | ||
4203 | if (allocator->summary_blocks == NULL) | |
4204 | return; | |
4205 | ||
4206 | for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) { | |
4207 | free_vio_components(&allocator->summary_blocks[i].vio); | |
0eea6b6e | 4208 | vdo_free(vdo_forget(allocator->summary_blocks[i].outgoing_entries)); |
c9ba9fd3 MS |
4209 | } |
4210 | ||
0eea6b6e | 4211 | vdo_free(vdo_forget(allocator->summary_blocks)); |
c9ba9fd3 MS |
4212 | } |
4213 | ||
7ce49449 MS |
4214 | /** |
4215 | * vdo_free_slab_depot() - Destroy a slab depot. | |
4216 | * @depot: The depot to destroy. | |
4217 | */ | |
4218 | void vdo_free_slab_depot(struct slab_depot *depot) | |
4219 | { | |
4220 | zone_count_t zone = 0; | |
4221 | ||
4222 | if (depot == NULL) | |
4223 | return; | |
4224 | ||
4225 | vdo_abandon_new_slabs(depot); | |
4226 | ||
4227 | for (zone = 0; zone < depot->zone_count; zone++) { | |
4228 | struct block_allocator *allocator = &depot->allocators[zone]; | |
4229 | ||
4230 | if (allocator->eraser != NULL) | |
0eea6b6e | 4231 | dm_kcopyd_client_destroy(vdo_forget(allocator->eraser)); |
7ce49449 MS |
4232 | |
4233 | uninitialize_allocator_summary(allocator); | |
4234 | uninitialize_scrubber_vio(&allocator->scrubber); | |
0eea6b6e MS |
4235 | free_vio_pool(vdo_forget(allocator->vio_pool)); |
4236 | vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); | |
7ce49449 MS |
4237 | } |
4238 | ||
4239 | if (depot->slabs != NULL) { | |
4240 | slab_count_t i; | |
4241 | ||
4242 | for (i = 0; i < depot->slab_count; i++) | |
0eea6b6e | 4243 | free_slab(vdo_forget(depot->slabs[i])); |
7ce49449 MS |
4244 | } |
4245 | ||
0eea6b6e MS |
4246 | vdo_free(vdo_forget(depot->slabs)); |
4247 | vdo_free(vdo_forget(depot->action_manager)); | |
4248 | vdo_free(vdo_forget(depot->summary_entries)); | |
4249 | vdo_free(depot); | |
7ce49449 MS |
4250 | } |
4251 | ||
4252 | /** | |
4253 | * vdo_record_slab_depot() - Record the state of a slab depot for encoding into the super block. | |
4254 | * @depot: The depot to encode. | |
4255 | * | |
4256 | * Return: The depot state. | |
4257 | */ | |
4258 | struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot) | |
4259 | { | |
4260 | /* | |
4261 | * If this depot is currently using 0 zones, it must have been synchronously loaded by a | |
4262 | * tool and is now being saved. We did not load and combine the slab summary, so we record | |
4263 | * the old zone count rather than 0 so that the combine can still be done on the next load. | |
4264 | */ | |
4265 | struct slab_depot_state_2_0 state; | |
4266 | zone_count_t zones_to_record = depot->zone_count; | |
4267 | ||
4268 | if (depot->zone_count == 0) | |
4269 | zones_to_record = depot->old_zone_count; | |
4270 | ||
4271 | state = (struct slab_depot_state_2_0) { | |
4272 | .slab_config = depot->slab_config, | |
4273 | .first_block = depot->first_block, | |
4274 | .last_block = depot->last_block, | |
4275 | .zone_count = zones_to_record, | |
4276 | }; | |
4277 | ||
4278 | return state; | |
4279 | } | |
4280 | ||
4281 | /** | |
4282 | * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot. | |
4283 | * @depot: The slab depot. | |
4284 | * Context: This method may be called only before entering normal operation from the load thread. | |
4285 | * | |
4286 | * Return: VDO_SUCCESS or an error. | |
4287 | */ | |
4288 | int vdo_allocate_reference_counters(struct slab_depot *depot) | |
4289 | { | |
4290 | struct slab_iterator iterator = | |
4291 | get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1); | |
4292 | ||
4293 | while (iterator.next != NULL) { | |
4294 | int result = allocate_slab_counters(next_slab(&iterator)); | |
4295 | ||
4296 | if (result != VDO_SUCCESS) | |
4297 | return result; | |
4298 | } | |
4299 | ||
4300 | return VDO_SUCCESS; | |
4301 | } | |
4302 | ||
4303 | /** | |
4304 | * get_slab_number() - Get the number of the slab that contains a specified block. | |
4305 | * @depot: The slab depot. | |
4306 | * @pbn: The physical block number. | |
4307 | * @slab_number_ptr: A pointer to hold the slab number. | |
4308 | * | |
4309 | * Return: VDO_SUCCESS or an error. | |
4310 | */ | |
4311 | static int __must_check get_slab_number(const struct slab_depot *depot, | |
4312 | physical_block_number_t pbn, | |
4313 | slab_count_t *slab_number_ptr) | |
4314 | { | |
4315 | slab_count_t slab_number; | |
4316 | ||
4317 | if (pbn < depot->first_block) | |
4318 | return VDO_OUT_OF_RANGE; | |
4319 | ||
4320 | slab_number = (pbn - depot->first_block) >> depot->slab_size_shift; | |
4321 | if (slab_number >= depot->slab_count) | |
4322 | return VDO_OUT_OF_RANGE; | |
4323 | ||
4324 | *slab_number_ptr = slab_number; | |
4325 | return VDO_SUCCESS; | |
4326 | } | |
4327 | ||
4328 | /** | |
4329 | * vdo_get_slab() - Get the slab object for the slab that contains a specified block. | |
4330 | * @depot: The slab depot. | |
4331 | * @pbn: The physical block number. | |
4332 | * | |
4333 | * Will put the VDO in read-only mode if the PBN is neither a valid data block nor the zero block. | |
4334 | * | |
4335 | * Return: The slab containing the block, or NULL if the block number is the zero block or | |
4336 | * otherwise out of range. | |
4337 | */ | |
4338 | struct vdo_slab *vdo_get_slab(const struct slab_depot *depot, | |
4339 | physical_block_number_t pbn) | |
4340 | { | |
4341 | slab_count_t slab_number; | |
4342 | int result; | |
4343 | ||
4344 | if (pbn == VDO_ZERO_BLOCK) | |
4345 | return NULL; | |
4346 | ||
4347 | result = get_slab_number(depot, pbn, &slab_number); | |
4348 | if (result != VDO_SUCCESS) { | |
4349 | vdo_enter_read_only_mode(depot->vdo, result); | |
4350 | return NULL; | |
4351 | } | |
4352 | ||
4353 | return depot->slabs[slab_number]; | |
4354 | } | |
4355 | ||
4356 | /** | |
4357 | * vdo_get_increment_limit() - Determine how many new references a block can acquire. | |
4358 | * @depot: The slab depot. | |
4359 | * @pbn: The physical block number that is being queried. | |
4360 | * | |
4361 | * Context: This method must be called from the physical zone thread of the PBN. | |
4362 | * | |
4363 | * Return: The number of available references. | |
4364 | */ | |
4365 | u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn) | |
4366 | { | |
4367 | struct vdo_slab *slab = vdo_get_slab(depot, pbn); | |
4368 | vdo_refcount_t *counter_ptr = NULL; | |
4369 | int result; | |
4370 | ||
4371 | if ((slab == NULL) || (slab->status != VDO_SLAB_REBUILT)) | |
4372 | return 0; | |
4373 | ||
4374 | result = get_reference_counter(slab, pbn, &counter_ptr); | |
4375 | if (result != VDO_SUCCESS) | |
4376 | return 0; | |
4377 | ||
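/*
 * A provisionally referenced block is treated as though it already held one reference,
 * so one fewer increment is available than for an unreferenced block.
 */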
4378 | if (*counter_ptr == PROVISIONAL_REFERENCE_COUNT) | |
4379 | return (MAXIMUM_REFERENCE_COUNT - 1); | |
4380 | ||
4381 | return (MAXIMUM_REFERENCE_COUNT - *counter_ptr); | |
4382 | } | |
4383 | ||
4384 | /** | |
4385 | * vdo_is_physical_data_block() - Determine whether the given PBN refers to a data block. | |
4386 | * @depot: The depot. | |
4387 | * @pbn: The physical block number to ask about. | |
4388 | * | |
4389 | * Return: True if the PBN corresponds to a data block. | |
4390 | */ | |
4391 | bool vdo_is_physical_data_block(const struct slab_depot *depot, | |
4392 | physical_block_number_t pbn) | |
4393 | { | |
4394 | slab_count_t slab_number; | |
4395 | slab_block_number sbn; | |
4396 | ||
4397 | return ((pbn == VDO_ZERO_BLOCK) || | |
4398 | ((get_slab_number(depot, pbn, &slab_number) == VDO_SUCCESS) && | |
4399 | (slab_block_number_from_pbn(depot->slabs[slab_number], pbn, &sbn) == | |
4400 | VDO_SUCCESS))); | |
4401 | } | |
4402 | ||
4403 | /** | |
4404 | * vdo_get_slab_depot_allocated_blocks() - Get the total number of data blocks allocated across all | |
4405 | * the slabs in the depot. | |
4406 | * @depot: The slab depot. | |
4407 | * | |
4408 | * This is the total number of blocks with a non-zero reference count. | |
4409 | * | |
4410 | * Context: This may be called from any thread. | |
4411 | * | |
4412 | * Return: The total number of blocks with a non-zero reference count. | |
4413 | */ | |
4414 | block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot) | |
4415 | { | |
4416 | block_count_t total = 0; | |
4417 | zone_count_t zone; | |
4418 | ||
4419 | for (zone = 0; zone < depot->zone_count; zone++) { | |
4420 | /* The allocators are responsible for thread safety. */ | |
4421 | total += READ_ONCE(depot->allocators[zone].allocated_blocks); | |
4422 | } | |
4423 | ||
4424 | return total; | |
4425 | } | |
4426 | ||
4427 | /** | |
4428 | * vdo_get_slab_depot_data_blocks() - Get the total number of data blocks in all the slabs in the | |
4429 | * depot. | |
4430 | * @depot: The slab depot. | |
4431 | * | |
4432 | * Context: This may be called from any thread. | |
4433 | * | |
4434 | * Return: The total number of data blocks in all slabs. | |
4435 | */ | |
4436 | block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot) | |
4437 | { | |
4438 | return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks); | |
4439 | } | |
4440 | ||
c9ba9fd3 MS |
4441 | /** |
4442 | * finish_combining_zones() - Clean up after saving out the combined slab summary. | |
4443 | * @completion: The vio which was used to write the summary data. | |
4444 | */ | |
4445 | static void finish_combining_zones(struct vdo_completion *completion) | |
4446 | { | |
4447 | int result = completion->result; | |
4448 | struct vdo_completion *parent = completion->parent; | |
4449 | ||
0eea6b6e | 4450 | free_vio(as_vio(vdo_forget(completion))); |
c9ba9fd3 MS |
4451 | vdo_fail_completion(parent, result); |
4452 | } | |
4453 | ||
4454 | static void handle_combining_error(struct vdo_completion *completion) | |
4455 | { | |
4456 | vio_record_metadata_io_error(as_vio(completion)); | |
4457 | finish_combining_zones(completion); | |
4458 | } | |
4459 | ||
4460 | static void write_summary_endio(struct bio *bio) | |
4461 | { | |
4462 | struct vio *vio = bio->bi_private; | |
4463 | struct vdo *vdo = vio->completion.vdo; | |
4464 | ||
4465 | continue_vio_after_io(vio, finish_combining_zones, | |
4466 | vdo->thread_config.admin_thread); | |
4467 | } | |
4468 | ||
4469 | /** | |
4470 | * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones, | |
4471 | * update every zone to the correct values for every slab. | |
4472 | * @depot: The depot whose summary entries should be combined. | |
4473 | */ | |
4474 | static void combine_summaries(struct slab_depot *depot) | |
4475 | { | |
4476 | /* | |
4477 | * Combine all the old summary data into the portion of the buffer corresponding to the | |
4478 | * first zone. | |
4479 | */ | |
4480 | zone_count_t zone = 0; | |
4481 | struct slab_summary_entry *entries = depot->summary_entries; | |
4482 | ||
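/*
 * Slabs are assigned to zones round-robin, so (for example) with old_zone_count == 2,
 * slab 0's authoritative entry is in zone 0's region, slab 1's is in zone 1's region,
 * slab 2's is back in zone 0's region, and so on. The loop below gathers each slab's
 * entry from its owning zone's region into the first zone's region.
 */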
4483 | if (depot->old_zone_count > 1) { | |
4484 | slab_count_t entry_number; | |
4485 | ||
4486 | for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) { | |
4487 | if (zone != 0) { | |
4488 | memcpy(entries + entry_number, | |
4489 | entries + (zone * MAX_VDO_SLABS) + entry_number, | |
4490 | sizeof(struct slab_summary_entry)); | |
4491 | } | |
4492 | ||
4493 | zone++; | |
4494 | if (zone == depot->old_zone_count) | |
4495 | zone = 0; | |
4496 | } | |
4497 | } | |
4498 | ||
4499 | /* Copy the combined data to each zone's region of the buffer. */ | |
4500 | for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) { | |
4501 | memcpy(entries + (zone * MAX_VDO_SLABS), entries, | |
4502 | MAX_VDO_SLABS * sizeof(struct slab_summary_entry)); | |
4503 | } | |
4504 | } | |
4505 | ||
4506 | /** | |
4507 | * finish_loading_summary() - Finish loading slab summary data. | |
4508 | * @completion: The vio which was used to read the summary data. | |
4509 | * | |
4510 | * Combines the slab summary data from all the previously written zones and copies the combined | |
4511 | * summary to each zone's region of the buffer. Then writes the combined summary back out to disk. This | |
4512 | * callback is registered in load_summary_endio(). | |
4513 | */ | |
4514 | static void finish_loading_summary(struct vdo_completion *completion) | |
4515 | { | |
4516 | struct slab_depot *depot = completion->vdo->depot; | |
4517 | ||
4518 | /* Combine the summary from each zone so each zone is correct for all slabs. */ | |
4519 | combine_summaries(depot); | |
4520 | ||
4521 | /* Write the combined summary back out. */ | |
f7f46761 MS |
4522 | vdo_submit_metadata_vio(as_vio(completion), depot->summary_origin, |
4523 | write_summary_endio, handle_combining_error, | |
4524 | REQ_OP_WRITE); | |
c9ba9fd3 MS |
4525 | } |
4526 | ||
4527 | static void load_summary_endio(struct bio *bio) | |
4528 | { | |
4529 | struct vio *vio = bio->bi_private; | |
4530 | struct vdo *vdo = vio->completion.vdo; | |
4531 | ||
4532 | continue_vio_after_io(vio, finish_loading_summary, | |
4533 | vdo->thread_config.admin_thread); | |
4534 | } | |
4535 | ||
4536 | /** | |
4537 | * load_slab_summary() - The preamble of a load operation. | |
4538 | * | |
4539 | * Implements vdo_action_preamble_fn. | |
4540 | */ | |
4541 | static void load_slab_summary(void *context, struct vdo_completion *parent) | |
4542 | { | |
4543 | int result; | |
4544 | struct vio *vio; | |
4545 | struct slab_depot *depot = context; | |
4546 | const struct admin_state_code *operation = | |
4547 | vdo_get_current_manager_operation(depot->action_manager); | |
4548 | ||
4549 | result = create_multi_block_metadata_vio(depot->vdo, VIO_TYPE_SLAB_SUMMARY, | |
4550 | VIO_PRIORITY_METADATA, parent, | |
4551 | VDO_SLAB_SUMMARY_BLOCKS, | |
4552 | (char *) depot->summary_entries, &vio); | |
4553 | if (result != VDO_SUCCESS) { | |
4554 | vdo_fail_completion(parent, result); | |
4555 | return; | |
4556 | } | |
4557 | ||
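/*
 * When formatting, or loading for a rebuild, the on-disk summary is not trusted, so skip
 * the read and let finish_loading_summary() write out the default in-memory entries.
 */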
4558 | if ((operation == VDO_ADMIN_STATE_FORMATTING) || | |
4559 | (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) { | |
4560 | finish_loading_summary(&vio->completion); | |
4561 | return; | |
883069e3 MS |
4562 | } |
4563 | ||
f7f46761 MS |
4564 | vdo_submit_metadata_vio(vio, depot->summary_origin, load_summary_endio, |
4565 | handle_combining_error, REQ_OP_READ); | |
883069e3 | 4566 | } |
c9ba9fd3 | 4567 | |
7ce49449 MS |
4568 | /* Implements vdo_zone_action_fn. */ |
4569 | static void load_allocator(void *context, zone_count_t zone_number, | |
4570 | struct vdo_completion *parent) | |
4571 | { | |
4572 | struct slab_depot *depot = context; | |
4573 | ||
4574 | vdo_start_loading(&depot->allocators[zone_number].state, | |
4575 | vdo_get_current_manager_operation(depot->action_manager), | |
4576 | parent, initiate_load); | |
4577 | } | |
4578 | ||
4579 | /** | |
4580 | * vdo_load_slab_depot() - Asynchronously load any slab depot state that isn't included in the | |
4581 | * super_block component. | |
4582 | * @depot: The depot to load. | |
4583 | * @operation: The type of load to perform. | |
4584 | * @parent: The completion to notify when the load is complete. | |
4585 | * @context: Additional context for the load operation; may be NULL. | |
4586 | * | |
4587 | * This method may be called only before entering normal operation from the load thread. | |
4588 | */ | |
4589 | void vdo_load_slab_depot(struct slab_depot *depot, | |
4590 | const struct admin_state_code *operation, | |
4591 | struct vdo_completion *parent, void *context) | |
4592 | { | |
4593 | if (!vdo_assert_load_operation(operation, parent)) | |
4594 | return; | |
4595 | ||
4596 | vdo_schedule_operation_with_context(depot->action_manager, operation, | |
4597 | load_slab_summary, load_allocator, | |
4598 | NULL, context, parent); | |
4599 | } | |
4600 | ||
4601 | /* Implements vdo_zone_action_fn. */ | |
4602 | static void prepare_to_allocate(void *context, zone_count_t zone_number, | |
4603 | struct vdo_completion *parent) | |
4604 | { | |
4605 | struct slab_depot *depot = context; | |
4606 | struct block_allocator *allocator = &depot->allocators[zone_number]; | |
4607 | int result; | |
4608 | ||
4609 | result = vdo_prepare_slabs_for_allocation(allocator); | |
4610 | if (result != VDO_SUCCESS) { | |
4611 | vdo_fail_completion(parent, result); | |
4612 | return; | |
4613 | } | |
4614 | ||
4615 | scrub_slabs(allocator, parent); | |
4616 | } | |
4617 | ||
4618 | /** | |
4619 | * vdo_prepare_slab_depot_to_allocate() - Prepare the slab depot to come online and start | |
4620 | * allocating blocks. | |
4621 | * @depot: The depot to prepare. | |
4622 | * @load_type: The load type. | |
4623 | * @parent: The completion to notify when the operation is complete. | |
4624 | * | |
4625 | * This method may be called only before entering normal operation from the load thread. It must be | |
4626 | * called before allocation may proceed. | |
4627 | */ | |
4628 | void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot, | |
4629 | enum slab_depot_load_type load_type, | |
4630 | struct vdo_completion *parent) | |
4631 | { | |
4632 | depot->load_type = load_type; | |
4633 | atomic_set(&depot->zones_to_scrub, depot->zone_count); | |
4634 | vdo_schedule_action(depot->action_manager, NULL, | |
4635 | prepare_to_allocate, NULL, parent); | |
4636 | } | |
4637 | ||
4638 | /** | |
4639 | * vdo_update_slab_depot_size() - Update the slab depot to reflect its new size in memory. | |
4640 | * @depot: The depot to update. | |
4641 | * | |
4642 | * This size is saved to disk as part of the super block. | |
4643 | */ | |
4644 | void vdo_update_slab_depot_size(struct slab_depot *depot) | |
4645 | { | |
4646 | depot->last_block = depot->new_last_block; | |
4647 | } | |
4648 | ||
4649 | /** | |
4650 | * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to | |
4651 | * the given size. | |
4652 | * @depot: The depot to prepare to resize. | |
4653 | * @partition: The new depot partition. | |
4654 | * | |
4655 | * Return: VDO_SUCCESS or an error. | |
4656 | */ | |
4657 | int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, | |
4658 | const struct partition *partition) | |
4659 | { | |
4660 | struct slab_depot_state_2_0 new_state; | |
4661 | int result; | |
4662 | slab_count_t new_slab_count; | |
4663 | ||
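/*
 * Growth happens in whole slabs: if the new partition does not hold at least one more
 * full slab than the depot already has, there is nothing to add.
 */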
4664 | if ((partition->count >> depot->slab_size_shift) <= depot->slab_count) | |
4665 | return VDO_INCREMENT_TOO_SMALL; | |
4666 | ||
4667 | /* Generate the depot configuration for the new block count. */ | |
4668 | ASSERT_LOG_ONLY(depot->first_block == partition->offset, | |
4669 | "New slab depot partition doesn't change origin"); | |
4670 | result = vdo_configure_slab_depot(partition, depot->slab_config, | |
4671 | depot->zone_count, &new_state); | |
4672 | if (result != VDO_SUCCESS) | |
4673 | return result; | |
4674 | ||
4675 | new_slab_count = vdo_compute_slab_count(depot->first_block, | |
4676 | new_state.last_block, | |
4677 | depot->slab_size_shift); | |
4678 | if (new_slab_count <= depot->slab_count) | |
4679 | return uds_log_error_strerror(VDO_INCREMENT_TOO_SMALL, | |
4680 | "Depot can only grow"); | |
4681 | if (new_slab_count == depot->new_slab_count) { | |
4682 | /* Check it out, we've already got all the new slabs allocated! */ | |
4683 | return VDO_SUCCESS; | |
4684 | } | |
4685 | ||
4686 | vdo_abandon_new_slabs(depot); | |
4687 | result = allocate_slabs(depot, new_slab_count); | |
4688 | if (result != VDO_SUCCESS) { | |
4689 | vdo_abandon_new_slabs(depot); | |
4690 | return result; | |
4691 | } | |
4692 | ||
4693 | depot->new_size = partition->count; | |
4694 | depot->old_last_block = depot->last_block; | |
4695 | depot->new_last_block = new_state.last_block; | |
4696 | ||
4697 | return VDO_SUCCESS; | |
4698 | } | |
4699 | ||
4700 | /** | |
4701 | * finish_registration() - Finish registering new slabs now that all of the allocators have | |
4702 | * received their new slabs. | |
4703 | * | |
4704 | * Implements vdo_action_conclusion_fn. | |
4705 | */ | |
4706 | static int finish_registration(void *context) | |
4707 | { | |
4708 | struct slab_depot *depot = context; | |
4709 | ||
4710 | WRITE_ONCE(depot->slab_count, depot->new_slab_count); | |
0eea6b6e | 4711 | vdo_free(depot->slabs); |
7ce49449 MS |
4712 | depot->slabs = depot->new_slabs; |
4713 | depot->new_slabs = NULL; | |
4714 | depot->new_slab_count = 0; | |
4715 | return VDO_SUCCESS; | |
4716 | } | |
4717 | ||
4718 | /* Implements vdo_zone_action_fn. */ | |
4719 | static void register_new_slabs(void *context, zone_count_t zone_number, | |
4720 | struct vdo_completion *parent) | |
4721 | { | |
4722 | struct slab_depot *depot = context; | |
4723 | struct block_allocator *allocator = &depot->allocators[zone_number]; | |
4724 | slab_count_t i; | |
4725 | ||
4726 | for (i = depot->slab_count; i < depot->new_slab_count; i++) { | |
4727 | struct vdo_slab *slab = depot->new_slabs[i]; | |
4728 | ||
4729 | if (slab->allocator == allocator) | |
4730 | register_slab_with_allocator(allocator, slab); | |
4731 | } | |
4732 | ||
4733 | vdo_finish_completion(parent); | |
4734 | } | |
4735 | ||
4736 | /** | |
4737 | * vdo_use_new_slabs() - Use the new slabs allocated for resize. | |
4738 | * @depot: The depot. | |
4739 | * @parent: The object to notify when complete. | |
4740 | */ | |
4741 | void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent) | |
4742 | { | |
4743 | ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use"); | |
4744 | vdo_schedule_operation(depot->action_manager, | |
4745 | VDO_ADMIN_STATE_SUSPENDED_OPERATION, | |
4746 | NULL, register_new_slabs, | |
4747 | finish_registration, parent); | |
4748 | } | |
4749 | ||
c9ba9fd3 MS |
4750 | /** |
4751 | * stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is | |
4752 | * currently working on. | |
4753 | * @allocator: The allocator owning the scrubber; its completion will be notified when | |
4754 | * scrubbing has stopped. | |
4755 | */ | |
4756 | static void stop_scrubbing(struct block_allocator *allocator) | |
4757 | { | |
4758 | struct slab_scrubber *scrubber = &allocator->scrubber; | |
4759 | ||
4760 | if (vdo_is_state_quiescent(&scrubber->admin_state)) { | |
4761 | vdo_finish_completion(&allocator->completion); | |
4762 | } else { | |
4763 | vdo_start_draining(&scrubber->admin_state, | |
4764 | VDO_ADMIN_STATE_SUSPENDING, | |
4765 | &allocator->completion, NULL); | |
4766 | } | |
4767 | } | |
4768 | ||
4769 | /* Implements vdo_admin_initiator_fn. */ | |
4770 | static void initiate_summary_drain(struct admin_state *state) | |
4771 | { | |
4772 | check_summary_drain_complete(container_of(state, struct block_allocator, | |
4773 | summary_state)); | |
4774 | } | |
4775 | ||
4776 | static void do_drain_step(struct vdo_completion *completion) | |
4777 | { | |
4778 | struct block_allocator *allocator = vdo_as_block_allocator(completion); | |
4779 | ||
4780 | vdo_prepare_completion_for_requeue(&allocator->completion, do_drain_step, | |
4781 | handle_operation_error, allocator->thread_id, | |
4782 | NULL); | |
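/*
 * Each pass advances to the next drain step in enum order: scrubber, then slabs, then
 * the summary, then finished. do_resume_step() walks these same steps in reverse.
 */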
4783 | switch (++allocator->drain_step) { | |
4784 | case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER: | |
4785 | stop_scrubbing(allocator); | |
4786 | return; | |
4787 | ||
4788 | case VDO_DRAIN_ALLOCATOR_STEP_SLABS: | |
4789 | apply_to_slabs(allocator, do_drain_step); | |
4790 | return; | |
4791 | ||
4792 | case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY: | |
4793 | vdo_start_draining(&allocator->summary_state, | |
4794 | vdo_get_admin_state_code(&allocator->state), | |
4795 | completion, initiate_summary_drain); | |
4796 | return; | |
4797 | ||
4798 | case VDO_DRAIN_ALLOCATOR_STEP_FINISHED: | |
4799 | ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool), | |
4800 | "vio pool not busy"); | |
4801 | vdo_finish_draining_with_result(&allocator->state, completion->result); | |
4802 | return; | |
4803 | ||
4804 | default: | |
4805 | vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE); | |
4806 | } | |
4807 | } | |
4808 | ||
4809 | /* Implements vdo_admin_initiator_fn. */ | |
4810 | static void initiate_drain(struct admin_state *state) | |
4811 | { | |
4812 | struct block_allocator *allocator = | |
4813 | container_of(state, struct block_allocator, state); | |
4814 | ||
4815 | allocator->drain_step = VDO_DRAIN_ALLOCATOR_START; | |
4816 | do_drain_step(&allocator->completion); | |
4817 | } | |
4818 | ||
7ce49449 MS |
4819 | /* |
4820 | * Drain all allocator I/O. Depending upon the type of drain, some or all dirty metadata may be | |
4821 | * written to disk. The type of drain will be determined from the state of the allocator's depot. | |
4822 | * | |
4823 | * Implements vdo_zone_action_fn. | |
4824 | */ | |
4825 | static void drain_allocator(void *context, zone_count_t zone_number, | |
4826 | struct vdo_completion *parent) | |
4827 | { | |
4828 | struct slab_depot *depot = context; | |
4829 | ||
4830 | vdo_start_draining(&depot->allocators[zone_number].state, | |
4831 | vdo_get_current_manager_operation(depot->action_manager), | |
4832 | parent, initiate_drain); | |
4833 | } | |
4834 | ||
4835 | /** | |
4836 | * vdo_drain_slab_depot() - Drain all slab depot I/O. | |
4837 | * @depot: The depot to drain. | |
4838 | * @operation: The drain operation (flush, rebuild, suspend, or save). | |
4839 | * @parent: The completion to finish when the drain is complete. | |
4840 | * | |
4841 | * If saving or flushing, all dirty depot metadata will be written out. If saving or suspending, | |
4842 | * the depot will be left in a suspended state. | |
4843 | */ | |
4844 | void vdo_drain_slab_depot(struct slab_depot *depot, | |
4845 | const struct admin_state_code *operation, | |
4846 | struct vdo_completion *parent) | |
4847 | { | |
4848 | vdo_schedule_operation(depot->action_manager, operation, | |
4849 | NULL, drain_allocator, NULL, parent); | |
4850 | } | |
4851 | ||
c9ba9fd3 MS |
4852 | /** |
4853 | * resume_scrubbing() - Tell the scrubber to resume scrubbing if it has been stopped. | |
4854 | * @allocator: The allocator being resumed. | |
4855 | */ | |
4856 | static void resume_scrubbing(struct block_allocator *allocator) | |
4857 | { | |
4858 | int result; | |
4859 | struct slab_scrubber *scrubber = &allocator->scrubber; | |
4860 | ||
4861 | if (!has_slabs_to_scrub(scrubber)) { | |
4862 | vdo_finish_completion(&allocator->completion); | |
4863 | return; | |
4864 | } | |
4865 | ||
4866 | result = vdo_resume_if_quiescent(&scrubber->admin_state); | |
4867 | if (result != VDO_SUCCESS) { | |
4868 | vdo_fail_completion(&allocator->completion, result); | |
4869 | return; | |
4870 | } | |
4871 | ||
4872 | scrub_next_slab(scrubber); | |
4873 | vdo_finish_completion(&allocator->completion); | |
4874 | } | |
4875 | ||
4876 | static void do_resume_step(struct vdo_completion *completion) | |
4877 | { | |
4878 | struct block_allocator *allocator = vdo_as_block_allocator(completion); | |
4879 | ||
4880 | vdo_prepare_completion_for_requeue(&allocator->completion, do_resume_step, | |
4881 | handle_operation_error, | |
4882 | allocator->thread_id, NULL); | |
4883 | switch (--allocator->drain_step) { | |
4884 | case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY: | |
4885 | vdo_fail_completion(completion, | |
4886 | vdo_resume_if_quiescent(&allocator->summary_state)); | |
4887 | return; | |
4888 | ||
4889 | case VDO_DRAIN_ALLOCATOR_STEP_SLABS: | |
4890 | apply_to_slabs(allocator, do_resume_step); | |
4891 | return; | |
4892 | ||
4893 | case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER: | |
4894 | resume_scrubbing(allocator); | |
4895 | return; | |
4896 | ||
4897 | case VDO_DRAIN_ALLOCATOR_START: | |
4898 | vdo_finish_resuming_with_result(&allocator->state, completion->result); | |
4899 | return; | |
4900 | ||
4901 | default: | |
4902 | vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE); | |
4903 | } | |
4904 | } | |
4905 | ||
4906 | /* Implements vdo_admin_initiator_fn. */ | |
4907 | static void initiate_resume(struct admin_state *state) | |
4908 | { | |
4909 | struct block_allocator *allocator = | |
4910 | container_of(state, struct block_allocator, state); | |
4911 | ||
4912 | allocator->drain_step = VDO_DRAIN_ALLOCATOR_STEP_FINISHED; | |
4913 | do_resume_step(&allocator->completion); | |
4914 | } | |
4915 | ||
4916 | /* Implements vdo_zone_action_fn. */ | |
4917 | static void resume_allocator(void *context, zone_count_t zone_number, | |
4918 | struct vdo_completion *parent) | |
4919 | { | |
4920 | struct slab_depot *depot = context; | |
4921 | ||
4922 | vdo_start_resuming(&depot->allocators[zone_number].state, | |
4923 | vdo_get_current_manager_operation(depot->action_manager), | |
4924 | parent, initiate_resume); | |
4925 | } | |
7ce49449 MS |
4926 | |
4927 | /** | |
4928 | * vdo_resume_slab_depot() - Resume a suspended slab depot. | |
4929 | * @depot: The depot to resume. | |
4930 | * @parent: The completion to finish when the depot has resumed. | |
4931 | */ | |
4932 | void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent) | |
4933 | { | |
4934 | if (vdo_is_read_only(depot->vdo)) { | |
4935 | vdo_continue_completion(parent, VDO_READ_ONLY); | |
4936 | return; | |
4937 | } | |
4938 | ||
4939 | vdo_schedule_operation(depot->action_manager, VDO_ADMIN_STATE_RESUMING, | |
4940 | NULL, resume_allocator, NULL, parent); | |
4941 | } | |
4942 | ||
4943 | /** | |
4944 | * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks which are locking a | |
4945 | * given recovery journal block. | |
4946 | * @depot: The depot. | |
4947 | * @recovery_block_number: The sequence number of the recovery journal block whose locks should be | |
4948 | * released. | |
4949 | * | |
4950 | * Context: This method must be called from the journal zone thread. | |
4951 | */ | |
4952 | void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, | |
4953 | sequence_number_t recovery_block_number) | |
4954 | { | |
4955 | if (depot == NULL) | |
4956 | return; | |
4957 | ||
4958 | depot->new_release_request = recovery_block_number; | |
4959 | vdo_schedule_default_action(depot->action_manager); | |
4960 | } | |
4961 | ||
4962 | /* Implements vdo_zone_action_fn. */ | |
4963 | static void scrub_all_unrecovered_slabs(void *context, zone_count_t zone_number, | |
4964 | struct vdo_completion *parent) | |
4965 | { | |
4966 | struct slab_depot *depot = context; | |
4967 | ||
4968 | scrub_slabs(&depot->allocators[zone_number], NULL); | |
4969 | vdo_launch_completion(parent); | |
4970 | } | |
4971 | ||
4972 | /** | |
4973 | * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs. | |
4974 | * @depot: The depot to scrub. | |
4975 | * @parent: The object to notify when scrubbing has been launched for all zones. | |
4976 | */ | |
4977 | void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, | |
4978 | struct vdo_completion *parent) | |
4979 | { | |
4980 | vdo_schedule_action(depot->action_manager, NULL, | |
4981 | scrub_all_unrecovered_slabs, | |
4982 | NULL, parent); | |
4983 | } | |
4984 | ||
4985 | /** | |
4986 | * get_block_allocator_statistics() - Get the total of the statistics from all the block allocators | |
4987 | * in the depot. | |
4988 | * @depot: The slab depot. | |
4989 | * | |
4990 | * Return: The statistics from all block allocators in the depot. | |
4991 | */ | |
4992 | static struct block_allocator_statistics __must_check | |
4993 | get_block_allocator_statistics(const struct slab_depot *depot) | |
4994 | { | |
4995 | struct block_allocator_statistics totals; | |
4996 | zone_count_t zone; | |
4997 | ||
4998 | memset(&totals, 0, sizeof(totals)); | |
4999 | ||
5000 | for (zone = 0; zone < depot->zone_count; zone++) { | |
5001 | const struct block_allocator *allocator = &depot->allocators[zone]; | |
5002 | const struct block_allocator_statistics *stats = &allocator->statistics; | |
5003 | ||
5004 | totals.slab_count += allocator->slab_count; | |
5005 | totals.slabs_opened += READ_ONCE(stats->slabs_opened); | |
5006 | totals.slabs_reopened += READ_ONCE(stats->slabs_reopened); | |
5007 | } | |
5008 | ||
5009 | return totals; | |
5010 | } | |
5011 | ||
5012 | /** | |
5013 | * get_ref_counts_statistics() - Get the cumulative ref_counts statistics for the depot. | |
5014 | * @depot: The slab depot. | |
5015 | * | |
5016 | * Return: The cumulative statistics for all ref_counts in the depot. | |
5017 | */ | |
5018 | static struct ref_counts_statistics __must_check | |
5019 | get_ref_counts_statistics(const struct slab_depot *depot) | |
5020 | { | |
5021 | struct ref_counts_statistics totals; | |
5022 | zone_count_t zone; | |
5023 | ||
5024 | memset(&totals, 0, sizeof(totals)); | |
5025 | ||
5026 | for (zone = 0; zone < depot->zone_count; zone++) { | |
5027 | totals.blocks_written += | |
5028 | READ_ONCE(depot->allocators[zone].ref_counts_statistics.blocks_written); | |
5029 | } | |
5030 | ||
5031 | return totals; | |
5032 | } | |
5033 | ||
5034 | /** | |
eebd4e16 | 5035 | * get_slab_journal_statistics() - Get the aggregated slab journal statistics for the depot. |
7ce49449 MS |
5036 | * @depot: The slab depot. |
5037 | * | |
5038 | * Return: The aggregated statistics for all slab journals in the depot. | |
5039 | */ | |
5040 | static struct slab_journal_statistics __must_check | |
5041 | get_slab_journal_statistics(const struct slab_depot *depot) | |
5042 | { | |
5043 | struct slab_journal_statistics totals; | |
5044 | zone_count_t zone; | |
5045 | ||
5046 | memset(&totals, 0, sizeof(totals)); | |
5047 | ||
5048 | for (zone = 0; zone < depot->zone_count; zone++) { | |
5049 | const struct slab_journal_statistics *stats = | |
5050 | &depot->allocators[zone].slab_journal_statistics; | |
5051 | ||
5052 | totals.disk_full_count += READ_ONCE(stats->disk_full_count); | |
5053 | totals.flush_count += READ_ONCE(stats->flush_count); | |
5054 | totals.blocked_count += READ_ONCE(stats->blocked_count); | |
5055 | totals.blocks_written += READ_ONCE(stats->blocks_written); | |
5056 | totals.tail_busy_count += READ_ONCE(stats->tail_busy_count); | |
5057 | } | |
5058 | ||
5059 | return totals; | |
5060 | } | |
5061 | ||
5062 | /** | |
5063 | * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that are properties of the | |
5064 | * slab depot. | |
5065 | * @depot: The slab depot. | |
5066 | * @stats: The vdo statistics structure to partially fill. | |
5067 | */ | |
5068 | void vdo_get_slab_depot_statistics(const struct slab_depot *depot, | |
5069 | struct vdo_statistics *stats) | |
5070 | { | |
5071 | slab_count_t slab_count = READ_ONCE(depot->slab_count); | |
5072 | slab_count_t unrecovered = 0; | |
5073 | zone_count_t zone; | |
5074 | ||
5075 | for (zone = 0; zone < depot->zone_count; zone++) { | |
5076 | /* The allocators are responsible for thread safety. */ | |
5077 | unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count); | |
5078 | } | |
5079 | ||
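/* This calculation assumes slab_count is nonzero. */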
5080 | stats->recovery_percentage = (slab_count - unrecovered) * 100 / slab_count; | |
5081 | stats->allocator = get_block_allocator_statistics(depot); | |
5082 | stats->ref_counts = get_ref_counts_statistics(depot); | |
5083 | stats->slab_journal = get_slab_journal_statistics(depot); | |
5084 | stats->slab_summary = (struct slab_summary_statistics) { | |
5085 | .blocks_written = atomic64_read(&depot->summary_statistics.blocks_written), | |
5086 | }; | |
5087 | } | |
5088 | ||
5089 | /** | |
5090 | * vdo_dump_slab_depot() - Dump the slab depot, in a thread-unsafe fashion. | |
5091 | * @depot: The slab depot. | |
5092 | */ | |
5093 | void vdo_dump_slab_depot(const struct slab_depot *depot) | |
5094 | { | |
5095 | uds_log_info("vdo slab depot"); | |
5096 | uds_log_info(" zone_count=%u old_zone_count=%u slabCount=%u active_release_request=%llu new_release_request=%llu", | |
5097 | (unsigned int) depot->zone_count, | |
5098 | (unsigned int) depot->old_zone_count, READ_ONCE(depot->slab_count), | |
5099 | (unsigned long long) depot->active_release_request, | |
5100 | (unsigned long long) depot->new_release_request); | |
5101 | } |