dm vdo: move indexer files into sub-directory
author    Mike Snitzer <snitzer@kernel.org>
          Thu, 8 Feb 2024 21:55:29 +0000 (15:55 -0600)
committer Mike Snitzer <snitzer@kernel.org>
          Mon, 4 Mar 2024 20:07:55 +0000 (15:07 -0500)
The goal is to assist high-level understanding of which code is
conceptually specific to VDO's indexer.

Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Signed-off-by: Matthew Sakai <msakai@redhat.com>
68 files changed:
drivers/md/dm-vdo/Makefile
drivers/md/dm-vdo/chapter-index.c [deleted file]
drivers/md/dm-vdo/chapter-index.h [deleted file]
drivers/md/dm-vdo/config.c [deleted file]
drivers/md/dm-vdo/config.h [deleted file]
drivers/md/dm-vdo/data-vio.h
drivers/md/dm-vdo/dedupe.c
drivers/md/dm-vdo/delta-index.c [deleted file]
drivers/md/dm-vdo/delta-index.h [deleted file]
drivers/md/dm-vdo/funnel-requestqueue.c [deleted file]
drivers/md/dm-vdo/funnel-requestqueue.h [deleted file]
drivers/md/dm-vdo/geometry.c [deleted file]
drivers/md/dm-vdo/geometry.h [deleted file]
drivers/md/dm-vdo/hash-utils.h [deleted file]
drivers/md/dm-vdo/index-layout.c [deleted file]
drivers/md/dm-vdo/index-layout.h [deleted file]
drivers/md/dm-vdo/index-page-map.c [deleted file]
drivers/md/dm-vdo/index-page-map.h [deleted file]
drivers/md/dm-vdo/index-session.c [deleted file]
drivers/md/dm-vdo/index-session.h [deleted file]
drivers/md/dm-vdo/index.c [deleted file]
drivers/md/dm-vdo/index.h [deleted file]
drivers/md/dm-vdo/indexer.h [deleted file]
drivers/md/dm-vdo/indexer/chapter-index.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/chapter-index.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/config.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/config.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/delta-index.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/delta-index.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/funnel-requestqueue.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/funnel-requestqueue.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/geometry.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/geometry.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/hash-utils.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index-layout.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index-layout.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index-page-map.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index-page-map.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index-session.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index-session.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/index.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/indexer.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/io-factory.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/io-factory.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/open-chapter.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/open-chapter.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/radix-sort.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/radix-sort.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/sparse-cache.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/sparse-cache.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/volume-index.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/volume-index.h [new file with mode: 0644]
drivers/md/dm-vdo/indexer/volume.c [new file with mode: 0644]
drivers/md/dm-vdo/indexer/volume.h [new file with mode: 0644]
drivers/md/dm-vdo/io-factory.c [deleted file]
drivers/md/dm-vdo/io-factory.h [deleted file]
drivers/md/dm-vdo/open-chapter.c [deleted file]
drivers/md/dm-vdo/open-chapter.h [deleted file]
drivers/md/dm-vdo/radix-sort.c [deleted file]
drivers/md/dm-vdo/radix-sort.h [deleted file]
drivers/md/dm-vdo/sparse-cache.c [deleted file]
drivers/md/dm-vdo/sparse-cache.h [deleted file]
drivers/md/dm-vdo/uds-sysfs.c
drivers/md/dm-vdo/volume-index.c [deleted file]
drivers/md/dm-vdo/volume-index.h [deleted file]
drivers/md/dm-vdo/volume.c [deleted file]
drivers/md/dm-vdo/volume.h [deleted file]

diff --git a/drivers/md/dm-vdo/Makefile b/drivers/md/dm-vdo/Makefile
index 32266ab04cc19b05f4b1e3e2c09e71856d6648cb..502a7a0acbdbb7ddf85bb7013c58c7ae1e6877b8 100644
@@ -1,50 +1,39 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
+ccflags-y := -I$(srctree)/$(src) -I$(srctree)/$(src)/indexer
+
 obj-$(CONFIG_DM_VDO) += dm-vdo.o
 
 dm-vdo-objs := \
        action-manager.o \
        admin-state.o \
        block-map.o \
-       chapter-index.o \
        completion.o \
-       config.o \
        data-vio.o \
        dedupe.o \
-       delta-index.o \
        dm-vdo-target.o \
        dump.o \
        encodings.o \
        errors.o \
        flush.o \
        funnel-queue.o \
-       funnel-requestqueue.o \
        funnel-workqueue.o \
-       geometry.o \
-       index-layout.o \
-       index.o \
-       index-page-map.o \
-       index-session.o \
        int-map.o \
-       io-factory.o \
        io-submitter.o \
        logger.o \
        logical-zone.o \
        memory-alloc.o \
        message-stats.o \
        murmurhash3.o \
-       open-chapter.o \
        packer.o \
        permassert.o \
        physical-zone.o \
        pool-sysfs.o \
        pool-sysfs-stats.o \
        priority-table.o \
-       radix-sort.o \
        recovery-journal.o \
        repair.o \
        slab-depot.o \
-       sparse-cache.o \
        status-codes.o \
        string-utils.o \
        sysfs.o \
@@ -54,6 +43,19 @@ dm-vdo-objs := \
        uds-sysfs.o \
        vdo.o \
        vio.o \
-       volume-index.o \
-       volume.o \
-       wait-queue.o
+       wait-queue.o \
+       indexer/chapter-index.o \
+       indexer/config.o \
+       indexer/delta-index.o \
+       indexer/funnel-requestqueue.o \
+       indexer/geometry.o \
+       indexer/index.o \
+       indexer/index-layout.o \
+       indexer/index-page-map.o \
+       indexer/index-session.o \
+       indexer/io-factory.o \
+       indexer/open-chapter.o \
+       indexer/radix-sort.o \
+       indexer/sparse-cache.o \
+       indexer/volume.o \
+       indexer/volume-index.o
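
The new ccflags-y line puts both the driver directory and the new indexer/ sub-directory on the quote-include search path for every object this Makefile builds, which is why the #include lines in the data-vio.h and dedupe.c hunks further down can keep using bare header names after the move. A minimal sketch of how the includes are expected to resolve (the resolutions in the comments are inferred from the -I flags, not taken from build output):

    /* From a source file outside the sub-directory, such as dedupe.c: */
    #include "indexer.h"      /* no longer beside the includer; found via -I$(srctree)/$(src)/indexer */
    #include "memory-alloc.h" /* unchanged; still lives in drivers/md/dm-vdo/ */

    /* From a source file inside indexer/, such as indexer/config.c: */
    #include "logger.h"       /* found via -I$(srctree)/$(src), i.e. drivers/md/dm-vdo/ */

Keeping the header names flat means no #include directive has to name the new sub-directory; the data-vio.h and dedupe.c hunks only regroup the indexer.h include.
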
diff --git a/drivers/md/dm-vdo/chapter-index.c b/drivers/md/dm-vdo/chapter-index.c
deleted file mode 100644
index 9b9185c..0000000
+++ /dev/null
@@ -1,292 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "chapter-index.h"
-
-#include "errors.h"
-#include "hash-utils.h"
-#include "indexer.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "permassert.h"
-
-int uds_make_open_chapter_index(struct open_chapter_index **chapter_index,
-                               const struct index_geometry *geometry, u64 volume_nonce)
-{
-       int result;
-       size_t memory_size;
-       struct open_chapter_index *index;
-
-       result = uds_allocate(1, struct open_chapter_index, "open chapter index", &index);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       /*
-        * The delta index will rebalance delta lists when memory gets tight,
-        * so give the chapter index one extra page.
-        */
-       memory_size = ((geometry->index_pages_per_chapter + 1) * geometry->bytes_per_page);
-       index->geometry = geometry;
-       index->volume_nonce = volume_nonce;
-       result = uds_initialize_delta_index(&index->delta_index, 1,
-                                           geometry->delta_lists_per_chapter,
-                                           geometry->chapter_mean_delta,
-                                           geometry->chapter_payload_bits,
-                                           memory_size, 'm');
-       if (result != UDS_SUCCESS) {
-               uds_free(index);
-               return result;
-       }
-
-       index->memory_size = index->delta_index.memory_size + sizeof(struct open_chapter_index);
-       *chapter_index = index;
-       return UDS_SUCCESS;
-}
-
-void uds_free_open_chapter_index(struct open_chapter_index *chapter_index)
-{
-       if (chapter_index == NULL)
-               return;
-
-       uds_uninitialize_delta_index(&chapter_index->delta_index);
-       uds_free(chapter_index);
-}
-
-/* Re-initialize an open chapter index for a new chapter. */
-void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index,
-                                 u64 virtual_chapter_number)
-{
-       uds_reset_delta_index(&chapter_index->delta_index);
-       chapter_index->virtual_chapter_number = virtual_chapter_number;
-}
-
-static inline bool was_entry_found(const struct delta_index_entry *entry, u32 address)
-{
-       return (!entry->at_end) && (entry->key == address);
-}
-
-/* Associate a record name with the record page containing its metadata. */
-int uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index,
-                                     const struct uds_record_name *name,
-                                     u32 page_number)
-{
-       int result;
-       struct delta_index_entry entry;
-       u32 address;
-       u32 list_number;
-       const u8 *found_name;
-       bool found;
-       const struct index_geometry *geometry = chapter_index->geometry;
-       u64 chapter_number = chapter_index->virtual_chapter_number;
-       u32 record_pages = geometry->record_pages_per_chapter;
-
-       result = ASSERT(page_number < record_pages,
-                       "Page number within chapter (%u) exceeds the maximum value %u",
-                       page_number, record_pages);
-       if (result != UDS_SUCCESS)
-               return UDS_INVALID_ARGUMENT;
-
-       address = uds_hash_to_chapter_delta_address(name, geometry);
-       list_number = uds_hash_to_chapter_delta_list(name, geometry);
-       result = uds_get_delta_index_entry(&chapter_index->delta_index, list_number,
-                                          address, name->name, &entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       found = was_entry_found(&entry, address);
-       result = ASSERT(!(found && entry.is_collision),
-                       "Chunk appears more than once in chapter %llu",
-                       (unsigned long long) chapter_number);
-       if (result != UDS_SUCCESS)
-               return UDS_BAD_STATE;
-
-       found_name = (found ? name->name : NULL);
-       return uds_put_delta_index_entry(&entry, address, page_number, found_name);
-}
-
-/*
- * Pack a section of an open chapter index into a chapter index page. A range of delta lists
- * (starting with a specified list index) is copied from the open chapter index into a memory page.
- * The number of lists copied onto the page is returned to the caller on success.
- *
- * @chapter_index: The open chapter index
- * @memory: The memory page to use
- * @first_list: The first delta list number to be copied
- * @last_page: If true, this is the last page of the chapter index and all the remaining lists must
- *             be packed onto this page
- * @lists_packed: The number of delta lists that were packed onto this page
- */
-int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
-                                    u8 *memory, u32 first_list, bool last_page,
-                                    u32 *lists_packed)
-{
-       int result;
-       struct delta_index *delta_index = &chapter_index->delta_index;
-       struct delta_index_stats stats;
-       u64 nonce = chapter_index->volume_nonce;
-       u64 chapter_number = chapter_index->virtual_chapter_number;
-       const struct index_geometry *geometry = chapter_index->geometry;
-       u32 list_count = geometry->delta_lists_per_chapter;
-       unsigned int removals = 0;
-       struct delta_index_entry entry;
-       u32 next_list;
-       s32 list_number;
-
-       for (;;) {
-               result = uds_pack_delta_index_page(delta_index, nonce, memory,
-                                                  geometry->bytes_per_page,
-                                                  chapter_number, first_list,
-                                                  lists_packed);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               if ((first_list + *lists_packed) == list_count) {
-                       /* All lists are packed. */
-                       break;
-               } else if (*lists_packed == 0) {
-                       /*
-                        * The next delta list does not fit on a page. This delta list will be
-                        * removed.
-                        */
-               } else if (last_page) {
-                       /*
-                        * This is the last page and there are lists left unpacked, but all of the
-                        * remaining lists must fit on the page. Find a list that contains entries
-                        * and remove the entire list. Try the first list that does not fit. If it
-                        * is empty, we will select the last list that already fits and has any
-                        * entries.
-                        */
-               } else {
-                       /* This page is done. */
-                       break;
-               }
-
-               if (removals == 0) {
-                       uds_get_delta_index_stats(delta_index, &stats);
-                       uds_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions",
-                                       (unsigned long long) chapter_number,
-                                       (unsigned long long) stats.record_count,
-                                       (unsigned long long) stats.collision_count);
-               }
-
-               list_number = *lists_packed;
-               do {
-                       if (list_number < 0)
-                               return UDS_OVERFLOW;
-
-                       next_list = first_list + list_number--,
-                       result = uds_start_delta_index_search(delta_index, next_list, 0,
-                                                             &entry);
-                       if (result != UDS_SUCCESS)
-                               return result;
-
-                       result = uds_next_delta_index_entry(&entry);
-                       if (result != UDS_SUCCESS)
-                               return result;
-               } while (entry.at_end);
-
-               do {
-                       result = uds_remove_delta_index_entry(&entry);
-                       if (result != UDS_SUCCESS)
-                               return result;
-
-                       removals++;
-               } while (!entry.at_end);
-       }
-
-       if (removals > 0) {
-               uds_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index",
-                               (unsigned long long) chapter_number, removals);
-       }
-
-       return UDS_SUCCESS;
-}
-
-/* Make a new chapter index page, initializing it with the data from a given index_page buffer. */
-int uds_initialize_chapter_index_page(struct delta_index_page *index_page,
-                                     const struct index_geometry *geometry,
-                                     u8 *page_buffer, u64 volume_nonce)
-{
-       return uds_initialize_delta_index_page(index_page, volume_nonce,
-                                              geometry->chapter_mean_delta,
-                                              geometry->chapter_payload_bits,
-                                              page_buffer, geometry->bytes_per_page);
-}
-
-/* Validate a chapter index page read during rebuild. */
-int uds_validate_chapter_index_page(const struct delta_index_page *index_page,
-                                   const struct index_geometry *geometry)
-{
-       int result;
-       const struct delta_index *delta_index = &index_page->delta_index;
-       u32 first = index_page->lowest_list_number;
-       u32 last = index_page->highest_list_number;
-       u32 list_number;
-
-       /* We walk every delta list from start to finish. */
-       for (list_number = first; list_number <= last; list_number++) {
-               struct delta_index_entry entry;
-
-               result = uds_start_delta_index_search(delta_index, list_number - first,
-                                                     0, &entry);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               for (;;) {
-                       result = uds_next_delta_index_entry(&entry);
-                       if (result != UDS_SUCCESS) {
-                               /*
-                                * A random bit stream is highly likely to arrive here when we go
-                                * past the end of the delta list.
-                                */
-                               return result;
-                       }
-
-                       if (entry.at_end)
-                               break;
-
-                       /* Also make sure that the record page field contains a plausible value. */
-                       if (uds_get_delta_entry_value(&entry) >=
-                           geometry->record_pages_per_chapter) {
-                               /*
-                                * Do not log this as an error. It happens in normal operation when
-                                * we are doing a rebuild but haven't written the entire volume
-                                * once.
-                                */
-                               return UDS_CORRUPT_DATA;
-                       }
-               }
-       }
-       return UDS_SUCCESS;
-}
-
-/*
- * Search a chapter index page for a record name, returning the record page number that may contain
- * the name.
- */
-int uds_search_chapter_index_page(struct delta_index_page *index_page,
-                                 const struct index_geometry *geometry,
-                                 const struct uds_record_name *name,
-                                 u16 *record_page_ptr)
-{
-       int result;
-       struct delta_index *delta_index = &index_page->delta_index;
-       u32 address = uds_hash_to_chapter_delta_address(name, geometry);
-       u32 delta_list_number = uds_hash_to_chapter_delta_list(name, geometry);
-       u32 sub_list_number = delta_list_number - index_page->lowest_list_number;
-       struct delta_index_entry entry;
-
-       result = uds_get_delta_index_entry(delta_index, sub_list_number, address,
-                                          name->name, &entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (was_entry_found(&entry, address))
-               *record_page_ptr = uds_get_delta_entry_value(&entry);
-       else
-               *record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
-
-       return UDS_SUCCESS;
-}
diff --git a/drivers/md/dm-vdo/chapter-index.h b/drivers/md/dm-vdo/chapter-index.h
deleted file mode 100644
index be8bf2b..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_CHAPTER_INDEX_H
-#define UDS_CHAPTER_INDEX_H
-
-#include <linux/limits.h>
-
-#include "delta-index.h"
-#include "geometry.h"
-
-/*
- * A chapter index for an open chapter is a mutable structure that tracks all the records that have
- * been added to the chapter. A chapter index for a closed chapter is similar except that it is
- * immutable because the contents of a closed chapter can never change, and the immutable structure
- * is more efficient. Both types of chapter index are implemented with a delta index.
- */
-
-/* The value returned when no entry is found in the chapter index. */
-#define NO_CHAPTER_INDEX_ENTRY U16_MAX
-
-struct open_chapter_index {
-       const struct index_geometry *geometry;
-       struct delta_index delta_index;
-       u64 virtual_chapter_number;
-       u64 volume_nonce;
-       size_t memory_size;
-};
-
-int __must_check uds_make_open_chapter_index(struct open_chapter_index **chapter_index,
-                                            const struct index_geometry *geometry,
-                                            u64 volume_nonce);
-
-void uds_free_open_chapter_index(struct open_chapter_index *chapter_index);
-
-void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index,
-                                 u64 virtual_chapter_number);
-
-int __must_check uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index,
-                                                  const struct uds_record_name *name,
-                                                  u32 page_number);
-
-int __must_check uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
-                                                 u8 *memory, u32 first_list,
-                                                 bool last_page, u32 *lists_packed);
-
-int __must_check uds_initialize_chapter_index_page(struct delta_index_page *index_page,
-                                                  const struct index_geometry *geometry,
-                                                  u8 *page_buffer, u64 volume_nonce);
-
-int __must_check uds_validate_chapter_index_page(const struct delta_index_page *index_page,
-                                                const struct index_geometry *geometry);
-
-int __must_check uds_search_chapter_index_page(struct delta_index_page *index_page,
-                                              const struct index_geometry *geometry,
-                                              const struct uds_record_name *name,
-                                              u16 *record_page_ptr);
-
-#endif /* UDS_CHAPTER_INDEX_H */
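
Taken together, the declarations above describe a simple lifecycle for the mutable form: create the open chapter index once, reset it for each new chapter, add one entry per record name, and pack the delta lists into fixed-size index pages when the chapter closes. A rough sketch of that flow, assuming the caller (in the real code, the volume layer) supplies the geometry, nonce, record names, and page buffers; the helper name and error handling here are illustrative only:

    /* Hypothetical helper: build and pack the index for one closing chapter. */
    static int pack_one_chapter(const struct index_geometry *geometry, u64 nonce,
                                u64 virtual_chapter, const struct uds_record_name *names,
                                const u32 *record_pages, u32 record_count, u8 *pages)
    {
            struct open_chapter_index *chapter_index;
            u32 i, first_list = 0, lists_packed;
            int result;

            result = uds_make_open_chapter_index(&chapter_index, geometry, nonce);
            if (result != UDS_SUCCESS)
                    return result;

            uds_empty_open_chapter_index(chapter_index, virtual_chapter);

            /* Map each record name to the record page that holds its data. */
            for (i = 0; (result == UDS_SUCCESS) && (i < record_count); i++)
                    result = uds_put_open_chapter_index_record(chapter_index, &names[i],
                                                               record_pages[i]);

            /* Pack successive runs of delta lists onto the chapter's index pages. */
            for (i = 0; (result == UDS_SUCCESS) && (i < geometry->index_pages_per_chapter); i++) {
                    bool last_page = (i == geometry->index_pages_per_chapter - 1);

                    result = uds_pack_open_chapter_index_page(chapter_index,
                                                              pages + (size_t) i * geometry->bytes_per_page,
                                                              first_list, last_page,
                                                              &lists_packed);
                    first_list += lists_packed;
            }

            uds_free_open_chapter_index(chapter_index);
            return result;
    }

During lookup, the immutable form built by uds_initialize_chapter_index_page() is searched with uds_search_chapter_index_page(), which yields either a record page number or NO_CHAPTER_INDEX_ENTRY.
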
diff --git a/drivers/md/dm-vdo/config.c b/drivers/md/dm-vdo/config.c
deleted file mode 100644
index 0bf315e..0000000
+++ /dev/null
@@ -1,378 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "config.h"
-
-#include "logger.h"
-#include "memory-alloc.h"
-#include "numeric.h"
-#include "string-utils.h"
-#include "thread-utils.h"
-
-static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC";
-static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02";
-static const u8 INDEX_CONFIG_VERSION_8_02[] = "08.02";
-
-enum {
-       DEFAULT_VOLUME_READ_THREADS = 2,
-       MAX_VOLUME_READ_THREADS = 16,
-       INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1,
-       INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION_6_02) - 1,
-};
-
-static bool is_version(const u8 *version, u8 *buffer)
-{
-       return memcmp(version, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0;
-}
-
-static bool are_matching_configurations(struct uds_configuration *saved_config,
-                                       struct index_geometry *saved_geometry,
-                                       struct uds_configuration *user)
-{
-       struct index_geometry *geometry = user->geometry;
-       bool result = true;
-
-       if (saved_geometry->record_pages_per_chapter != geometry->record_pages_per_chapter) {
-               uds_log_error("Record pages per chapter (%u) does not match (%u)",
-                             saved_geometry->record_pages_per_chapter,
-                             geometry->record_pages_per_chapter);
-               result = false;
-       }
-
-       if (saved_geometry->chapters_per_volume != geometry->chapters_per_volume) {
-               uds_log_error("Chapter count (%u) does not match (%u)",
-                             saved_geometry->chapters_per_volume,
-                             geometry->chapters_per_volume);
-               result = false;
-       }
-
-       if (saved_geometry->sparse_chapters_per_volume != geometry->sparse_chapters_per_volume) {
-               uds_log_error("Sparse chapter count (%u) does not match (%u)",
-                             saved_geometry->sparse_chapters_per_volume,
-                             geometry->sparse_chapters_per_volume);
-               result = false;
-       }
-
-       if (saved_config->cache_chapters != user->cache_chapters) {
-               uds_log_error("Cache size (%u) does not match (%u)",
-                             saved_config->cache_chapters, user->cache_chapters);
-               result = false;
-       }
-
-       if (saved_config->volume_index_mean_delta != user->volume_index_mean_delta) {
-               uds_log_error("Volume index mean delta (%u) does not match (%u)",
-                             saved_config->volume_index_mean_delta,
-                             user->volume_index_mean_delta);
-               result = false;
-       }
-
-       if (saved_geometry->bytes_per_page != geometry->bytes_per_page) {
-               uds_log_error("Bytes per page value (%zu) does not match (%zu)",
-                             saved_geometry->bytes_per_page, geometry->bytes_per_page);
-               result = false;
-       }
-
-       if (saved_config->sparse_sample_rate != user->sparse_sample_rate) {
-               uds_log_error("Sparse sample rate (%u) does not match (%u)",
-                             saved_config->sparse_sample_rate,
-                             user->sparse_sample_rate);
-               result = false;
-       }
-
-       if (saved_config->nonce != user->nonce) {
-               uds_log_error("Nonce (%llu) does not match (%llu)",
-                             (unsigned long long) saved_config->nonce,
-                             (unsigned long long) user->nonce);
-               result = false;
-       }
-
-       return result;
-}
-
-/* Read the configuration and validate it against the provided one. */
-int uds_validate_config_contents(struct buffered_reader *reader,
-                                struct uds_configuration *user_config)
-{
-       int result;
-       struct uds_configuration config;
-       struct index_geometry geometry;
-       u8 version_buffer[INDEX_CONFIG_VERSION_LENGTH];
-       u32 bytes_per_page;
-       u8 buffer[sizeof(struct uds_configuration_6_02)];
-       size_t offset = 0;
-
-       result = uds_verify_buffered_data(reader, INDEX_CONFIG_MAGIC,
-                                         INDEX_CONFIG_MAGIC_LENGTH);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_read_from_buffered_reader(reader, version_buffer,
-                                              INDEX_CONFIG_VERSION_LENGTH);
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "cannot read index config version");
-
-       if (!is_version(INDEX_CONFIG_VERSION_6_02, version_buffer) &&
-           !is_version(INDEX_CONFIG_VERSION_8_02, version_buffer)) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "unsupported configuration version: '%.*s'",
-                                             INDEX_CONFIG_VERSION_LENGTH,
-                                             version_buffer);
-       }
-
-       result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer));
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "cannot read config data");
-
-       decode_u32_le(buffer, &offset, &geometry.record_pages_per_chapter);
-       decode_u32_le(buffer, &offset, &geometry.chapters_per_volume);
-       decode_u32_le(buffer, &offset, &geometry.sparse_chapters_per_volume);
-       decode_u32_le(buffer, &offset, &config.cache_chapters);
-       offset += sizeof(u32);
-       decode_u32_le(buffer, &offset, &config.volume_index_mean_delta);
-       decode_u32_le(buffer, &offset, &bytes_per_page);
-       geometry.bytes_per_page = bytes_per_page;
-       decode_u32_le(buffer, &offset, &config.sparse_sample_rate);
-       decode_u64_le(buffer, &offset, &config.nonce);
-
-       result = ASSERT(offset == sizeof(struct uds_configuration_6_02),
-                       "%zu bytes read but not decoded",
-                       sizeof(struct uds_configuration_6_02) - offset);
-       if (result != UDS_SUCCESS)
-               return UDS_CORRUPT_DATA;
-
-       if (is_version(INDEX_CONFIG_VERSION_6_02, version_buffer)) {
-               user_config->geometry->remapped_virtual = 0;
-               user_config->geometry->remapped_physical = 0;
-       } else {
-               u8 remapping[sizeof(u64) + sizeof(u64)];
-
-               result = uds_read_from_buffered_reader(reader, remapping,
-                                                      sizeof(remapping));
-               if (result != UDS_SUCCESS)
-                       return uds_log_error_strerror(result, "cannot read converted config");
-
-               offset = 0;
-               decode_u64_le(remapping, &offset,
-                             &user_config->geometry->remapped_virtual);
-               decode_u64_le(remapping, &offset,
-                             &user_config->geometry->remapped_physical);
-       }
-
-       if (!are_matching_configurations(&config, &geometry, user_config)) {
-               uds_log_warning("Supplied configuration does not match save");
-               return UDS_NO_INDEX;
-       }
-
-       return UDS_SUCCESS;
-}
-
-/*
- * Write the configuration to stable storage. If the superblock version is < 4, write the 6.02
- * version; otherwise write the 8.02 version, indicating the configuration is for an index that has
- * been reduced by one chapter.
- */
-int uds_write_config_contents(struct buffered_writer *writer,
-                             struct uds_configuration *config, u32 version)
-{
-       int result;
-       struct index_geometry *geometry = config->geometry;
-       u8 buffer[sizeof(struct uds_configuration_8_02)];
-       size_t offset = 0;
-
-       result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_MAGIC,
-                                             INDEX_CONFIG_MAGIC_LENGTH);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       /*
-        * If version is < 4, the index has not been reduced by a chapter so it must be written out
-        * as version 6.02 so that it is still compatible with older versions of UDS.
-        */
-       if (version >= 4) {
-               result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_8_02,
-                                                     INDEX_CONFIG_VERSION_LENGTH);
-               if (result != UDS_SUCCESS)
-                       return result;
-       } else {
-               result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_6_02,
-                                                     INDEX_CONFIG_VERSION_LENGTH);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       encode_u32_le(buffer, &offset, geometry->record_pages_per_chapter);
-       encode_u32_le(buffer, &offset, geometry->chapters_per_volume);
-       encode_u32_le(buffer, &offset, geometry->sparse_chapters_per_volume);
-       encode_u32_le(buffer, &offset, config->cache_chapters);
-       encode_u32_le(buffer, &offset, 0);
-       encode_u32_le(buffer, &offset, config->volume_index_mean_delta);
-       encode_u32_le(buffer, &offset, geometry->bytes_per_page);
-       encode_u32_le(buffer, &offset, config->sparse_sample_rate);
-       encode_u64_le(buffer, &offset, config->nonce);
-
-       result = ASSERT(offset == sizeof(struct uds_configuration_6_02),
-                       "%zu bytes encoded, of %zu expected", offset,
-                       sizeof(struct uds_configuration_6_02));
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (version >= 4) {
-               encode_u64_le(buffer, &offset, geometry->remapped_virtual);
-               encode_u64_le(buffer, &offset, geometry->remapped_physical);
-       }
-
-       return uds_write_to_buffered_writer(writer, buffer, offset);
-}
-
-/* Compute configuration parameters that depend on memory size. */
-static int compute_memory_sizes(uds_memory_config_size_t mem_gb, bool sparse,
-                               u32 *chapters_per_volume, u32 *record_pages_per_chapter,
-                               u32 *sparse_chapters_per_volume)
-{
-       u32 reduced_chapters = 0;
-       u32 base_chapters;
-
-       if (mem_gb == UDS_MEMORY_CONFIG_256MB) {
-               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
-               *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER;
-       } else if (mem_gb == UDS_MEMORY_CONFIG_512MB) {
-               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
-               *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER;
-       } else if (mem_gb == UDS_MEMORY_CONFIG_768MB) {
-               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
-               *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER;
-       } else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) {
-               base_chapters = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME;
-               *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER;
-       } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) {
-               reduced_chapters = 1;
-               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
-               *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER;
-       } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) {
-               reduced_chapters = 1;
-               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
-               *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER;
-       } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) {
-               reduced_chapters = 1;
-               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
-               *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER;
-       } else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) &&
-                  (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) {
-               reduced_chapters = 1;
-               base_chapters = ((mem_gb - UDS_MEMORY_CONFIG_REDUCED) *
-                                DEFAULT_CHAPTERS_PER_VOLUME);
-               *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER;
-       } else {
-               uds_log_error("received invalid memory size");
-               return -EINVAL;
-       }
-
-       if (sparse) {
-               /* Make 95% of chapters sparse, allowing 10x more records. */
-               *sparse_chapters_per_volume = (19 * base_chapters) / 2;
-               base_chapters *= 10;
-       } else {
-               *sparse_chapters_per_volume = 0;
-       }
-
-       *chapters_per_volume = base_chapters - reduced_chapters;
-       return UDS_SUCCESS;
-}
-
-static unsigned int __must_check normalize_zone_count(unsigned int requested)
-{
-       unsigned int zone_count = requested;
-
-       if (zone_count == 0)
-               zone_count = num_online_cpus() / 2;
-
-       if (zone_count < 1)
-               zone_count = 1;
-
-       if (zone_count > MAX_ZONES)
-               zone_count = MAX_ZONES;
-
-       uds_log_info("Using %u indexing zone%s for concurrency.",
-                    zone_count, zone_count == 1 ? "" : "s");
-       return zone_count;
-}
-
-static unsigned int __must_check normalize_read_threads(unsigned int requested)
-{
-       unsigned int read_threads = requested;
-
-       if (read_threads < 1)
-               read_threads = DEFAULT_VOLUME_READ_THREADS;
-
-       if (read_threads > MAX_VOLUME_READ_THREADS)
-               read_threads = MAX_VOLUME_READ_THREADS;
-
-       return read_threads;
-}
-
-int uds_make_configuration(const struct uds_parameters *params,
-                          struct uds_configuration **config_ptr)
-{
-       struct uds_configuration *config;
-       u32 chapters_per_volume = 0;
-       u32 record_pages_per_chapter = 0;
-       u32 sparse_chapters_per_volume = 0;
-       int result;
-
-       result = compute_memory_sizes(params->memory_size, params->sparse,
-                                     &chapters_per_volume, &record_pages_per_chapter,
-                                     &sparse_chapters_per_volume);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(1, struct uds_configuration, __func__, &config);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE, record_pages_per_chapter,
-                                        chapters_per_volume, sparse_chapters_per_volume,
-                                        0, 0, &config->geometry);
-       if (result != UDS_SUCCESS) {
-               uds_free_configuration(config);
-               return result;
-       }
-
-       config->zone_count = normalize_zone_count(params->zone_count);
-       config->read_threads = normalize_read_threads(params->read_threads);
-
-       config->cache_chapters = DEFAULT_CACHE_CHAPTERS;
-       config->volume_index_mean_delta = DEFAULT_VOLUME_INDEX_MEAN_DELTA;
-       config->sparse_sample_rate = (params->sparse ? DEFAULT_SPARSE_SAMPLE_RATE : 0);
-       config->nonce = params->nonce;
-       config->bdev = params->bdev;
-       config->offset = params->offset;
-       config->size = params->size;
-
-       *config_ptr = config;
-       return UDS_SUCCESS;
-}
-
-void uds_free_configuration(struct uds_configuration *config)
-{
-       if (config != NULL) {
-               uds_free_index_geometry(config->geometry);
-               uds_free(config);
-       }
-}
-
-void uds_log_configuration(struct uds_configuration *config)
-{
-       struct index_geometry *geometry = config->geometry;
-
-       uds_log_debug("Configuration:");
-       uds_log_debug("  Record pages per chapter:   %10u", geometry->record_pages_per_chapter);
-       uds_log_debug("  Chapters per volume:        %10u", geometry->chapters_per_volume);
-       uds_log_debug("  Sparse chapters per volume: %10u", geometry->sparse_chapters_per_volume);
-       uds_log_debug("  Cache size (chapters):      %10u", config->cache_chapters);
-       uds_log_debug("  Volume index mean delta:    %10u", config->volume_index_mean_delta);
-       uds_log_debug("  Bytes per page:             %10zu", geometry->bytes_per_page);
-       uds_log_debug("  Sparse sample rate:         %10u", config->sparse_sample_rate);
-       uds_log_debug("  Nonce:                      %llu", (unsigned long long) config->nonce);
-}
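
The sparse sizing rule in compute_memory_sizes() is easier to see with the numbers written out: starting from the dense chapter count derived from the memory size (base_chapters), it marks (19 * base_chapters) / 2 chapters sparse and then multiplies the total by 10, so the sparse fraction is (19b/2) / (10b) = 19/20 = 95% for every memory size (ignoring the single chapter subtracted for the UDS_MEMORY_CONFIG_REDUCED_* sizes), matching the "95% of chapters sparse, allowing 10x more records" comment. A small stand-alone sketch of the same arithmetic (the 1024 starting value is illustrative, not a driver default):

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int base_chapters = 1024;                        /* illustrative dense count */
            unsigned int sparse_chapters = (19 * base_chapters) / 2;  /* 9728 */

            base_chapters *= 10;                                      /* 10240 total chapters */

            assert(sparse_chapters * 20 == base_chapters * 19);       /* exactly 95% sparse */
            printf("%u of %u chapters are sparse\n", sparse_chapters, base_chapters);
            return 0;
    }
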
diff --git a/drivers/md/dm-vdo/config.h b/drivers/md/dm-vdo/config.h
deleted file mode 100644
index 08507dc..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_CONFIG_H
-#define UDS_CONFIG_H
-
-#include "geometry.h"
-#include "indexer.h"
-#include "io-factory.h"
-
-/*
- * The uds_configuration records a variety of parameters used to configure a new UDS index. Some
- * parameters are provided by the client, while others are fixed or derived from user-supplied
- * values. It is created when an index is created, and it is recorded in the index metadata.
- */
-
-enum {
-       DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096,
-       DEFAULT_CACHE_CHAPTERS = 7,
-       DEFAULT_SPARSE_SAMPLE_RATE = 32,
-       MAX_ZONES = 16,
-};
-
-/* A set of configuration parameters for the indexer. */
-struct uds_configuration {
-       /* Storage device for the index */
-       struct block_device *bdev;
-
-       /* The maximum allowable size of the index */
-       size_t size;
-
-       /* The offset where the index should start */
-       off_t offset;
-
-       /* Parameters for the volume */
-
-       /* The volume layout */
-       struct index_geometry *geometry;
-
-       /* Index owner's nonce */
-       u64 nonce;
-
-       /* The number of threads used to process index requests */
-       unsigned int zone_count;
-
-       /* The number of threads used to read volume pages */
-       unsigned int read_threads;
-
-       /* Size of the page cache and sparse chapter index cache in chapters */
-       u32 cache_chapters;
-
-       /* Parameters for the volume index */
-
-       /* The mean delta for the volume index */
-       u32 volume_index_mean_delta;
-
-       /* Sampling rate for sparse indexing */
-       u32 sparse_sample_rate;
-};
-
-/* On-disk structure of data for a version 8.02 index. */
-struct uds_configuration_8_02 {
-       /* Smaller (16), Small (64) or large (256) indices */
-       u32 record_pages_per_chapter;
-       /* Total number of chapters per volume */
-       u32 chapters_per_volume;
-       /* Number of sparse chapters per volume */
-       u32 sparse_chapters_per_volume;
-       /* Size of the page cache, in chapters */
-       u32 cache_chapters;
-       /* Unused field */
-       u32 unused;
-       /* The volume index mean delta to use */
-       u32 volume_index_mean_delta;
-       /* Size of a page, used for both record pages and index pages */
-       u32 bytes_per_page;
-       /* Sampling rate for sparse indexing */
-       u32 sparse_sample_rate;
-       /* Index owner's nonce */
-       u64 nonce;
-       /* Virtual chapter remapped from physical chapter 0 */
-       u64 remapped_virtual;
-       /* New physical chapter which remapped chapter was moved to */
-       u64 remapped_physical;
-} __packed;
-
-/* On-disk structure of data for a version 6.02 index. */
-struct uds_configuration_6_02 {
-       /* Smaller (16), Small (64) or large (256) indices */
-       u32 record_pages_per_chapter;
-       /* Total number of chapters per volume */
-       u32 chapters_per_volume;
-       /* Number of sparse chapters per volume */
-       u32 sparse_chapters_per_volume;
-       /* Size of the page cache, in chapters */
-       u32 cache_chapters;
-       /* Unused field */
-       u32 unused;
-       /* The volume index mean delta to use */
-       u32 volume_index_mean_delta;
-       /* Size of a page, used for both record pages and index pages */
-       u32 bytes_per_page;
-       /* Sampling rate for sparse indexing */
-       u32 sparse_sample_rate;
-       /* Index owner's nonce */
-       u64 nonce;
-} __packed;
-
-int __must_check uds_make_configuration(const struct uds_parameters *params,
-                                       struct uds_configuration **config_ptr);
-
-void uds_free_configuration(struct uds_configuration *config);
-
-int __must_check uds_validate_config_contents(struct buffered_reader *reader,
-                                             struct uds_configuration *config);
-
-int __must_check uds_write_config_contents(struct buffered_writer *writer,
-                                          struct uds_configuration *config, u32 version);
-
-void uds_log_configuration(struct uds_configuration *config);
-
-#endif /* UDS_CONFIG_H */
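
The packed layouts above also explain the size checks back in config.c: uds_validate_config_contents() reads sizeof(struct uds_configuration_6_02) bytes and asserts that decoding consumed exactly that much, while uds_write_config_contents() sizes its buffer for the 8.02 layout. With __packed there is no padding, so the 6.02 record is eight u32 fields plus one u64, 40 bytes in all, and the 8.02 record appends remapped_virtual and remapped_physical for 56 bytes. A stand-alone sketch of the arithmetic (the struct here is a copy made only to illustrate the size; it is not the driver's type):

    #include <stdint.h>

    struct sketch_configuration_6_02 {
            uint32_t record_pages_per_chapter;
            uint32_t chapters_per_volume;
            uint32_t sparse_chapters_per_volume;
            uint32_t cache_chapters;
            uint32_t unused;
            uint32_t volume_index_mean_delta;
            uint32_t bytes_per_page;
            uint32_t sparse_sample_rate;
            uint64_t nonce;
    } __attribute__((packed));

    /* 8 * 4 + 8 = 40 bytes; the 8.02 layout adds two more u64 fields, giving 56. */
    _Static_assert(sizeof(struct sketch_configuration_6_02) == 40, "6.02 on-disk record is 40 bytes");
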
diff --git a/drivers/md/dm-vdo/data-vio.h b/drivers/md/dm-vdo/data-vio.h
index e7729623a6bb53a5107f34d8b3798f85e064aa3c..44fd0d8ccb769d9d5c21fddb30579016ec07b09b 100644
 #include <linux/bio.h>
 #include <linux/list.h>
 
-#include "indexer.h"
 #include "permassert.h"
 
+#include "indexer.h"
+
 #include "block-map.h"
 #include "completion.h"
 #include "constants.h"
diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c
index 942a50ef8b0d82cfbefecf3b2807377c81ecefe9..9468d7fad443564113329b53046f17afa6e3bc45 100644
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 
-#include "indexer.h"
 #include "logger.h"
 #include "memory-alloc.h"
 #include "numeric.h"
 #include "permassert.h"
 #include "string-utils.h"
 
+#include "indexer.h"
+
 #include "action-manager.h"
 #include "admin-state.h"
 #include "completion.h"
diff --git a/drivers/md/dm-vdo/delta-index.c b/drivers/md/dm-vdo/delta-index.c
deleted file mode 100644
index 66f51b5..0000000
+++ /dev/null
@@ -1,1987 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-#include "delta-index.h"
-
-#include <linux/bitops.h>
-#include <linux/bits.h>
-#include <linux/compiler.h>
-#include <linux/limits.h>
-#include <linux/log2.h>
-
-#include "config.h"
-#include "cpu.h"
-#include "errors.h"
-#include "indexer.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "numeric.h"
-#include "permassert.h"
-#include "string-utils.h"
-#include "time-utils.h"
-
-/*
- * The entries in a delta index could be stored in a single delta list, but to reduce search times
- * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
- * memory managed by the delta_zone structure. The delta_zone can move the data around within its
- * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
- * the volume index can contain over a million delta lists, we want to be efficient with the size
- * of the delta list header information. This information is encoded into 16 bytes per list. The
- * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
- * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
- * sufficient to store the size of a delta list.
- *
- * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
- * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
- * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
- * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
- * number corresponds to its index in the array.
- *
- * A standard delta list entry is stored as a fixed length payload (the value) followed by a
- * variable length key (the delta). A collision entry is used when two block names have the same
- * delta list address. A collision entry always follows a standard entry for the hash with which it
- * collides, and is encoded with DELTA == 0 with an additional 256-bit field at the end,
- * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
- * indicates a normal entry.
- *
- * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
- * used by small deltas. The Huffman code is specified by three parameters, which can be computed
- * from the desired mean delta when the index is full. (See compute_coding_constants() for
- * details.)
- *
- * The bit field utilities used to read and write delta entries assume that it is possible to read
- * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
- * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
- * numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit
- * stream includes a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
- * delta list could cause this step to run off the end of the delta_zone memory, so as extra
- * protection against this happening, the tail guard list is set to all ones.
- *
- * The delta_index supports two different forms. The mutable form is created by
- * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
- * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
- * cached) chapter index pages. The immutable form does not allocate delta list headers or
- * temporary offsets, and thus is somewhat more memory efficient.
- */
-
-/*
- * This is the largest field size supported by get_field() and set_field(). Any field that is
- * larger is not guaranteed to fit in a single byte-aligned u32.
- */
-enum {
-       MAX_FIELD_BITS = (sizeof(u32) - 1) * BITS_PER_BYTE + 1,
-};
-
-/*
- * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
- * is larger is not guaranteed to fit in a single byte-aligned u64.
- */
-enum {
-       MAX_BIG_FIELD_BITS = (sizeof(u64) - 1) * BITS_PER_BYTE + 1,
-};
-
-/*
- * This is the number of guard bytes needed at the end of the memory byte array when using the bit
- * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
- * bytes beyond the end of the desired field. The definition is written to make it clear how this
- * value is derived.
- */
-enum {
-       POST_FIELD_GUARD_BYTES = sizeof(u64) - 1,
-};
-
-/* The number of guard bits that are needed in the tail guard list */
-enum {
-       GUARD_BITS = POST_FIELD_GUARD_BYTES * BITS_PER_BYTE
-};
-
-/*
- * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
- * buffer of this size can be used with move_bits().
- */
-enum {
-       DELTA_LIST_MAX_BYTE_COUNT =
-               ((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
-};
-
-/* The number of extra bytes and bits needed to store a collision entry */
-enum {
-       COLLISION_BYTES = UDS_RECORD_NAME_SIZE,
-       COLLISION_BITS = COLLISION_BYTES * BITS_PER_BYTE
-};
-
-/*
- * Immutable delta lists are packed into pages containing a header that encodes the delta list
- * information into 19 bits per list (64KB bit offset).
- */
-
-enum { IMMUTABLE_HEADER_SIZE = 19 };
-
-/*
- * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
- * number to increment when the format of the data changes.
- */
-
-enum {
-       MAGIC_SIZE = 8,
-};
-
-static const char DELTA_INDEX_MAGIC[] = "DI-00002";
-
-struct delta_index_header {
-       char magic[MAGIC_SIZE];
-       u32 zone_number;
-       u32 zone_count;
-       u32 first_list;
-       u32 list_count;
-       u64 record_count;
-       u64 collision_count;
-};
-
-/*
- * Header data used for immutable delta index pages. This data is followed by the delta list offset
- * table.
- */
-struct delta_page_header {
-       /* Externally-defined nonce */
-       u64 nonce;
-       /* The virtual chapter number */
-       u64 virtual_chapter_number;
-       /* Index of the first delta list on the page */
-       u16 first_list;
-       /* Number of delta lists on the page */
-       u16 list_count;
-} __packed;
-
-static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
-{
-       return delta_list->start / BITS_PER_BYTE;
-}
-
-static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
-{
-       unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;
-
-       return BITS_TO_BYTES(bit_offset + delta_list->size);
-}
-
-static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
-                                u32 last)
-{
-       struct delta_list *delta_list;
-       u64 new_start;
-
-       if (first == last) {
-               /* Only one list is moving, and we know there is space. */
-               delta_list = &delta_zone->delta_lists[first];
-               new_start = delta_zone->new_offsets[first];
-               if (delta_list->start != new_start) {
-                       u64 source;
-                       u64 destination;
-
-                       source = get_delta_list_byte_start(delta_list);
-                       delta_list->start = new_start;
-                       destination = get_delta_list_byte_start(delta_list);
-                       memmove(delta_zone->memory + destination,
-                               delta_zone->memory + source,
-                               get_delta_list_byte_size(delta_list));
-               }
-       } else {
-               /*
-                * There is more than one list. Divide the problem in half, and use recursive calls
-                * to process each half. Note that after this computation, first <= middle, and
-                * middle < last.
-                */
-               u32 middle = (first + last) / 2;
-
-               delta_list = &delta_zone->delta_lists[middle];
-               new_start = delta_zone->new_offsets[middle];
-
-               /*
-                * The direction that our middle list is moving determines which half of the
-                * problem must be processed first.
-                */
-               if (new_start > delta_list->start) {
-                       rebalance_delta_zone(delta_zone, middle + 1, last);
-                       rebalance_delta_zone(delta_zone, first, middle);
-               } else {
-                       rebalance_delta_zone(delta_zone, first, middle);
-                       rebalance_delta_zone(delta_zone, middle + 1, last);
-               }
-       }
-}
-
-static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
-{
-       /* Round up so that each zone is a multiple of 64K in size. */
-       enum {
-               ALLOC_BOUNDARY = 64 * 1024,
-       };
-
-       return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
-}
-
-void uds_reset_delta_index(const struct delta_index *delta_index)
-{
-       unsigned int z;
-
-       /*
-        * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
-        * before the first real entry and one after so that we don't need to bounds check the
-        * array access when calculating preceding and following gap sizes.
-        */
-       for (z = 0; z < delta_index->zone_count; z++) {
-               u64 list_bits;
-               u64 spacing;
-               u64 offset;
-               unsigned int i;
-               struct delta_zone *zone = &delta_index->delta_zones[z];
-               struct delta_list *delta_lists = zone->delta_lists;
-
-               /* Zeroing the delta list headers initializes the head guard list correctly. */
-               memset(delta_lists, 0,
-                      (zone->list_count + 2) * sizeof(struct delta_list));
-
-               /* Set all the bits in the end guard list. */
-               list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
-               delta_lists[zone->list_count + 1].start = list_bits;
-               delta_lists[zone->list_count + 1].size = GUARD_BITS;
-               memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
-                      POST_FIELD_GUARD_BYTES);
-
-               /* Evenly space out the real delta lists by setting regular offsets. */
-               spacing = list_bits / zone->list_count;
-               offset = spacing / 2;
-               for (i = 1; i <= zone->list_count; i++) {
-                       delta_lists[i].start = offset;
-                       offset += spacing;
-               }
-
-               /* Update the statistics. */
-               zone->discard_count += zone->record_count;
-               zone->record_count = 0;
-               zone->collision_count = 0;
-       }
-}
-
-/* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
- * three parameters:
- *
- *  MINBITS   The number of bits in the smallest code
- *  BASE      The number of values coded using a code of length MINBITS
- *  INCR      The number of values coded by using one additional bit
- *
- * These parameters are related by this equation:
- *
- *     BASE + INCR == 1 << MINBITS
- *
- * The math for the Huffman code of an exponential distribution says that
- *
- *     INCR = log(2) * MEAN_DELTA
- *
- * Then use the smallest MINBITS value so that
- *
- *     (1 << MINBITS) > INCR
- *
- * And then
- *
- *     BASE = (1 << MINBITS) - INCR
- *
- * Now the index can generate a code such that
- * - The first BASE values code using MINBITS bits.
- * - The next INCR values code using MINBITS+1 bits.
- * - The next INCR values code using MINBITS+2 bits.
- * - (and so on).
- */
-static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
-{
-       /*
-        * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
-        * floating point, use a really good integer approximation.
-        */
-       *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
-       *min_bits = bits_per(*incr_keys + 1);
-       *min_keys = (1 << *min_bits) - *incr_keys;
-}
-
-void uds_uninitialize_delta_index(struct delta_index *delta_index)
-{
-       unsigned int z;
-
-       if (delta_index->delta_zones == NULL)
-               return;
-
-       for (z = 0; z < delta_index->zone_count; z++) {
-               uds_free(uds_forget(delta_index->delta_zones[z].new_offsets));
-               uds_free(uds_forget(delta_index->delta_zones[z].delta_lists));
-               uds_free(uds_forget(delta_index->delta_zones[z].memory));
-       }
-
-       uds_free(delta_index->delta_zones);
-       memset(delta_index, 0, sizeof(struct delta_index));
-}
-
-static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
-                                u32 first_list, u32 list_count, u32 mean_delta,
-                                u32 payload_bits, u8 tag)
-{
-       int result;
-
-       result = uds_allocate(size, u8, "delta list", &delta_zone->memory);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(list_count + 2, u64, "delta list temp",
-                             &delta_zone->new_offsets);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       /* Allocate the delta lists. */
-       result = uds_allocate(list_count + 2, struct delta_list, "delta lists",
-                             &delta_zone->delta_lists);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       compute_coding_constants(mean_delta, &delta_zone->min_bits,
-                                &delta_zone->min_keys, &delta_zone->incr_keys);
-       delta_zone->value_bits = payload_bits;
-       delta_zone->buffered_writer = NULL;
-       delta_zone->size = size;
-       delta_zone->rebalance_time = 0;
-       delta_zone->rebalance_count = 0;
-       delta_zone->record_count = 0;
-       delta_zone->collision_count = 0;
-       delta_zone->discard_count = 0;
-       delta_zone->overflow_count = 0;
-       delta_zone->first_list = first_list;
-       delta_zone->list_count = list_count;
-       delta_zone->tag = tag;
-
-       return UDS_SUCCESS;
-}
-
-int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
-                              u32 list_count, u32 mean_delta, u32 payload_bits,
-                              size_t memory_size, u8 tag)
-{
-       int result;
-       unsigned int z;
-       size_t zone_memory;
-
-       result = uds_allocate(zone_count, struct delta_zone, "Delta Index Zones",
-                             &delta_index->delta_zones);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       delta_index->zone_count = zone_count;
-       delta_index->list_count = list_count;
-       delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
-       delta_index->memory_size = 0;
-       delta_index->mutable = true;
-       delta_index->tag = tag;
-
-       for (z = 0; z < zone_count; z++) {
-               u32 lists_in_zone = delta_index->lists_per_zone;
-               u32 first_list_in_zone = z * lists_in_zone;
-
-               if (z == zone_count - 1) {
-                       /*
-                        * The last zone gets fewer lists if zone_count doesn't evenly divide
-                        * list_count. We'll have an underflow if the check below doesn't hold.
-                        */
-                       if (delta_index->list_count <= first_list_in_zone) {
-                               uds_uninitialize_delta_index(delta_index);
-                               return uds_log_error_strerror(UDS_INVALID_ARGUMENT,
-                                                             "%u delta lists not enough for %u zones",
-                                                             list_count, zone_count);
-                       }
-                       lists_in_zone = delta_index->list_count - first_list_in_zone;
-               }
-
-               zone_memory = get_zone_memory_size(zone_count, memory_size);
-               result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
-                                              first_list_in_zone, lists_in_zone,
-                                              mean_delta, payload_bits, tag);
-               if (result != UDS_SUCCESS) {
-                       uds_uninitialize_delta_index(delta_index);
-                       return result;
-               }
-
-               delta_index->memory_size +=
-                       (sizeof(struct delta_zone) + zone_memory +
-                        (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
-       }
-
-       uds_reset_delta_index(delta_index);
-       return UDS_SUCCESS;
-}
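
/*
 * Editorial usage sketch (not part of the original file; all parameter values
 * are made up): creating and later destroying a small single-zone mutable
 * delta index.
 *
 *     struct delta_index index;
 *     int result;
 *
 *     result = uds_initialize_delta_index(&index, 1, 1024, 4096, 20,
 *                                         2 * 1024 * 1024, 'm');
 *     if (result != UDS_SUCCESS)
 *             return result;
 *     ... add and look up entries ...
 *     uds_uninitialize_delta_index(&index);
 */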
-
-/* Read a bit field from an arbitrary bit boundary. */
-static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
-{
-       const void *addr = memory + offset / BITS_PER_BYTE;
-
-       return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
-}
-
-/* Write a bit field to an arbitrary bit boundary. */
-static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
-{
-       void *addr = memory + offset / BITS_PER_BYTE;
-       int shift = offset % BITS_PER_BYTE;
-       u32 data = get_unaligned_le32(addr);
-
-       data &= ~(((1 << size) - 1) << shift);
-       data |= value << shift;
-       put_unaligned_le32(data, addr);
-}
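
/*
 * Editorial sketch (not part of the original file): reading a 10-bit field at
 * bit offset 13 with get_field() loads the little-endian u32 at byte 13 / 8 = 1,
 * shifts it right by 13 % 8 = 5, and masks with (1 << 10) - 1. set_field() is
 * the inverse: it clears those 10 bits in the loaded word and ORs in the new
 * value before storing it back. Both rely on the field size plus the bit shift
 * fitting within the single 32-bit load.
 */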
-
-/* Get the bit offset to the immutable delta list header. */
-static inline u32 get_immutable_header_offset(u32 list_number)
-{
-       return sizeof(struct delta_page_header) * BITS_PER_BYTE +
-               list_number * IMMUTABLE_HEADER_SIZE;
-}
-
-/* Get the bit offset to the start of the immutable delta list bit stream. */
-static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
-{
-       return get_field(memory, get_immutable_header_offset(list_number),
-                        IMMUTABLE_HEADER_SIZE);
-}
-
-/* Set the bit offset to the start of the immutable delta list bit stream. */
-static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
-{
-       set_field(start, memory, get_immutable_header_offset(list_number),
-                 IMMUTABLE_HEADER_SIZE);
-}
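
/*
 * Editorial sketch (not part of the original file) of the immutable page layout
 * assumed by the helpers above, in bit order from the start of the page:
 *
 *     struct delta_page_header
 *     (list_count + 1) offsets of IMMUTABLE_HEADER_SIZE bits each; slot i holds
 *         the bit offset where list i starts, and slot list_count marks the end
 *         of the last list
 *     the delta list bit streams, packed back to back
 *     POST_FIELD_GUARD_BYTES of all-one guard bits at the end of the page
 */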
-
-static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
-                                   u8 *memory, size_t memory_size)
-{
-       unsigned int i;
-
-       /*
-        * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
-        * entire volume at least once.
-        */
-       if (nonce != expected_nonce)
-               return false;
-
-       /* Verify that the number of delta lists can fit in the page. */
-       if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
-                         BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
-               return false;
-
-       /*
-        * Verify that the first delta list is immediately after the last delta
-        * list header.
-        */
-       if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
-               return false;
-
-       /* Verify that the lists are in the correct order. */
-       for (i = 0; i < list_count; i++) {
-               if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
-                       return false;
-       }
-
-       /*
-        * Verify that the last list ends on the page, and that there is room
-        * for the post-field guard bits.
-        */
-       if (get_immutable_start(memory, list_count) >
-           (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
-               return false;
-
-       /* Verify that the guard bytes are correctly set to all ones. */
-       for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
-               if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
-                       return false;
-       }
-
-       /* All verifications passed. */
-       return true;
-}
-
-/* Initialize a delta index page to refer to a supplied page. */
-int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
-                                   u64 expected_nonce, u32 mean_delta, u32 payload_bits,
-                                   u8 *memory, size_t memory_size)
-{
-       u64 nonce;
-       u64 vcn;
-       u64 first_list;
-       u64 list_count;
-       struct delta_page_header *header = (struct delta_page_header *) memory;
-       struct delta_zone *delta_zone = &delta_index_page->delta_zone;
-       const u8 *nonce_addr = (const u8 *) &header->nonce;
-       const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
-       const u8 *first_list_addr = (const u8 *) &header->first_list;
-       const u8 *list_count_addr = (const u8 *) &header->list_count;
-
-       /* First assume that the header is little endian. */
-       nonce = get_unaligned_le64(nonce_addr);
-       vcn = get_unaligned_le64(vcn_addr);
-       first_list = get_unaligned_le16(first_list_addr);
-       list_count = get_unaligned_le16(list_count_addr);
-       if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
-                                    memory_size)) {
-               /* If that fails, try big endian. */
-               nonce = get_unaligned_be64(nonce_addr);
-               vcn = get_unaligned_be64(vcn_addr);
-               first_list = get_unaligned_be16(first_list_addr);
-               list_count = get_unaligned_be16(list_count_addr);
-               if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
-                                            memory_size)) {
-                       /*
-                        * Both attempts failed. Do not log this as an error, because it can happen
-                        * during a rebuild if we haven't written the entire volume at least once.
-                        */
-                       return UDS_CORRUPT_DATA;
-               }
-       }
-
-       delta_index_page->delta_index.delta_zones = delta_zone;
-       delta_index_page->delta_index.zone_count = 1;
-       delta_index_page->delta_index.list_count = list_count;
-       delta_index_page->delta_index.lists_per_zone = list_count;
-       delta_index_page->delta_index.mutable = false;
-       delta_index_page->delta_index.tag = 'p';
-       delta_index_page->virtual_chapter_number = vcn;
-       delta_index_page->lowest_list_number = first_list;
-       delta_index_page->highest_list_number = first_list + list_count - 1;
-
-       compute_coding_constants(mean_delta, &delta_zone->min_bits,
-                                &delta_zone->min_keys, &delta_zone->incr_keys);
-       delta_zone->value_bits = payload_bits;
-       delta_zone->memory = memory;
-       delta_zone->delta_lists = NULL;
-       delta_zone->new_offsets = NULL;
-       delta_zone->buffered_writer = NULL;
-       delta_zone->size = memory_size;
-       delta_zone->rebalance_time = 0;
-       delta_zone->rebalance_count = 0;
-       delta_zone->record_count = 0;
-       delta_zone->collision_count = 0;
-       delta_zone->discard_count = 0;
-       delta_zone->overflow_count = 0;
-       delta_zone->first_list = 0;
-       delta_zone->list_count = list_count;
-       delta_zone->tag = 'p';
-
-       return UDS_SUCCESS;
-}
-
-/* Read a large bit field from an arbitrary bit boundary. */
-static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
-{
-       const void *addr = memory + offset / BITS_PER_BYTE;
-
-       return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
-}
-
-/* Write a large bit field to an arbitrary bit boundary. */
-static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
-{
-       void *addr = memory + offset / BITS_PER_BYTE;
-       u8 shift = offset % BITS_PER_BYTE;
-       u64 data = get_unaligned_le64(addr);
-
-       data &= ~(((1UL << size) - 1) << shift);
-       data |= value << shift;
-       put_unaligned_le64(data, addr);
-}
-
-/* Set a sequence of bits to all zeros. */
-static inline void set_zero(u8 *memory, u64 offset, u32 size)
-{
-       if (size > 0) {
-               u8 *addr = memory + offset / BITS_PER_BYTE;
-               u8 shift = offset % BITS_PER_BYTE;
-               u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;
-
-               *addr++ &= ~(((1 << count) - 1) << shift);
-               for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
-                       *addr++ = 0;
-
-               if (size > 0)
-                       *addr &= 0xFF << size;
-       }
-}
-
-/*
- * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
- * size and memory offsets are measured in bits.
- */
-static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
-{
-       const u8 *source;
-       u8 *destination;
-       u8 offset;
-       u8 count;
-       u64 field;
-
-       /* Start by moving one field that ends on a destination int boundary. */
-       count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
-       field = get_big_field(from, from_offset, count);
-       set_big_field(field, to, to_offset, count);
-       from_offset += count;
-       to_offset += count;
-       size -= count;
-
-       /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
-       offset = from_offset % BITS_PER_TYPE(u32);
-       source = from + (from_offset - offset) / BITS_PER_BYTE;
-       destination = to + to_offset / BITS_PER_BYTE;
-       while (size > MAX_BIG_FIELD_BITS) {
-               put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
-               source += sizeof(u32);
-               destination += sizeof(u32);
-               from_offset += BITS_PER_TYPE(u32);
-               to_offset += BITS_PER_TYPE(u32);
-               size -= BITS_PER_TYPE(u32);
-       }
-
-       /* Finish up by moving any remaining bits. */
-       if (size > 0) {
-               field = get_big_field(from, from_offset, size);
-               set_big_field(field, to, to_offset, size);
-       }
-}
-
-/*
- * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
- * size and memory offsets are measured in bits.
- */
-static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
-{
-       const u8 *source;
-       u8 *destination;
-       u8 offset;
-       u8 count;
-       u64 field;
-
-       /* Start by moving one field that begins on a destination int boundary. */
-       count = (to_offset + size) % BITS_PER_TYPE(u32);
-       if (count > 0) {
-               size -= count;
-               field = get_big_field(from, from_offset + size, count);
-               set_big_field(field, to, to_offset + size, count);
-       }
-
-       /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
-       offset = (from_offset + size) % BITS_PER_TYPE(u32);
-       source = from + (from_offset + size - offset) / BITS_PER_BYTE;
-       destination = to + (to_offset + size) / BITS_PER_BYTE;
-       while (size > MAX_BIG_FIELD_BITS) {
-               source -= sizeof(u32);
-               destination -= sizeof(u32);
-               size -= BITS_PER_TYPE(u32);
-               put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
-       }
-
-       /* Finish up by moving any remaining bits. */
-       if (size > 0) {
-               field = get_big_field(from, from_offset, size);
-               set_big_field(field, to, to_offset, size);
-       }
-}
-
-/*
- * Move bits from one field to another. When the fields overlap, behave as if we first move all the
- * bits from the source to a temporary value, and then move all the bits from the temporary value
- * to the destination. The size and memory offsets are measured in bits.
- */
-static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
-{
-       u64 field;
-
-       /* A small move doesn't require special handling. */
-       if (size <= MAX_BIG_FIELD_BITS) {
-               if (size > 0) {
-                       field = get_big_field(from, from_offset, size);
-                       set_big_field(field, to, to_offset, size);
-               }
-
-               return;
-       }
-
-       if (from_offset > to_offset)
-               move_bits_down(from, from_offset, to, to_offset, size);
-       else
-               move_bits_up(from, from_offset, to, to_offset, size);
-}
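
/*
 * Editorial sketch (not part of the original file): for overlapping moves the
 * copy direction matters. Moving 100 bits from bit offset 40 down to bit offset
 * 8 overlaps, and uses move_bits_down(), which copies the lowest-addressed bits
 * first so no source bits are overwritten before they have been read; the
 * mirror case uses move_bits_up(). Moves of MAX_BIG_FIELD_BITS or fewer are
 * handled as a single get_big_field()/set_big_field() pair.
 */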
-
-/*
- * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
- * lists (starting with a specified list index) is copied from the mutable delta index into a
- * memory page used in the immutable index. The number of lists copied onto the page is returned in
- * list_count.
- */
-int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
-                             u8 *memory, size_t memory_size, u64 virtual_chapter_number,
-                             u32 first_list, u32 *list_count)
-{
-       const struct delta_zone *delta_zone;
-       struct delta_list *delta_lists;
-       u32 max_lists;
-       u32 n_lists = 0;
-       u32 offset;
-       u32 i;
-       int free_bits;
-       int bits;
-       struct delta_page_header *header;
-
-       delta_zone = &delta_index->delta_zones[0];
-       delta_lists = &delta_zone->delta_lists[first_list + 1];
-       max_lists = delta_index->list_count - first_list;
-
-       /*
-        * Compute how many lists will fit on the page. Subtract the size of the fixed header, one
-        * delta list offset, and the guard bytes from the page size to determine how much space is
-        * available for delta lists.
-        */
-       free_bits = memory_size * BITS_PER_BYTE;
-       free_bits -= get_immutable_header_offset(1);
-       free_bits -= GUARD_BITS;
-       if (free_bits < IMMUTABLE_HEADER_SIZE) {
-               /* This page is too small to store any delta lists. */
-               return uds_log_error_strerror(UDS_OVERFLOW,
-                                             "Chapter Index Page of %zu bytes is too small",
-                                             memory_size);
-       }
-
-       while (n_lists < max_lists) {
-               /* Each list requires a delta list offset and the list data. */
-               bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
-               if (bits > free_bits)
-                       break;
-
-               n_lists++;
-               free_bits -= bits;
-       }
-
-       *list_count = n_lists;
-
-       header = (struct delta_page_header *) memory;
-       put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
-       put_unaligned_le64(virtual_chapter_number,
-                          (u8 *) &header->virtual_chapter_number);
-       put_unaligned_le16(first_list, (u8 *) &header->first_list);
-       put_unaligned_le16(n_lists, (u8 *) &header->list_count);
-
-       /* Construct the delta list offset table. */
-       offset = get_immutable_header_offset(n_lists + 1);
-       set_immutable_start(memory, 0, offset);
-       for (i = 0; i < n_lists; i++) {
-               offset += delta_lists[i].size;
-               set_immutable_start(memory, i + 1, offset);
-       }
-
-       /* Copy the delta list data onto the memory page. */
-       for (i = 0; i < n_lists; i++) {
-               move_bits(delta_zone->memory, delta_lists[i].start, memory,
-                         get_immutable_start(memory, i), delta_lists[i].size);
-       }
-
-       /* Set all the bits in the guard bytes. */
-       memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
-              POST_FIELD_GUARD_BYTES);
-       return UDS_SUCCESS;
-}
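
/*
 * Editorial usage sketch (not part of the original file; 'page', 'page_size',
 * 'nonce', and 'vcn' are made-up names): packing consecutive delta lists into
 * fixed-size pages. A real caller would also need to handle lists_packed == 0,
 * which means the next list is too large to fit on an empty page.
 *
 *     u32 first_list = 0;
 *     u32 lists_packed;
 *     int result;
 *
 *     while (first_list < delta_index->list_count) {
 *             result = uds_pack_delta_index_page(delta_index, nonce, page,
 *                                                page_size, vcn, first_list,
 *                                                &lists_packed);
 *             if (result != UDS_SUCCESS)
 *                     return result;
 *             first_list += lists_packed;
 *     }
 */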
-
-/* Compute the new offsets of the delta lists. */
-static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
-                                    size_t growing_size, size_t used_space)
-{
-       size_t spacing;
-       u32 i;
-       struct delta_list *delta_lists = delta_zone->delta_lists;
-       u32 tail_guard_index = delta_zone->list_count + 1;
-
-       spacing = (delta_zone->size - used_space) / delta_zone->list_count;
-       delta_zone->new_offsets[0] = 0;
-       for (i = 0; i <= delta_zone->list_count; i++) {
-               delta_zone->new_offsets[i + 1] =
-                       (delta_zone->new_offsets[i] +
-                        get_delta_list_byte_size(&delta_lists[i]) + spacing);
-               delta_zone->new_offsets[i] *= BITS_PER_BYTE;
-               delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
-               if (i == 0)
-                       delta_zone->new_offsets[i + 1] -= spacing / 2;
-               if (i + 1 == growing_index)
-                       delta_zone->new_offsets[i + 1] += growing_size;
-       }
-
-       delta_zone->new_offsets[tail_guard_index] =
-               (delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
-}
-
-static void rebalance_lists(struct delta_zone *delta_zone)
-{
-       struct delta_list *delta_lists;
-       u32 i;
-       size_t used_space = 0;
-
-       /* Extend and balance memory to receive the delta lists */
-       delta_lists = delta_zone->delta_lists;
-       for (i = 0; i <= delta_zone->list_count + 1; i++)
-               used_space += get_delta_list_byte_size(&delta_lists[i]);
-
-       compute_new_list_offsets(delta_zone, 0, 0, used_space);
-       for (i = 1; i <= delta_zone->list_count + 1; i++)
-               delta_lists[i].start = delta_zone->new_offsets[i];
-}
-
-/* Start restoring a delta index from multiple input streams. */
-int uds_start_restoring_delta_index(struct delta_index *delta_index,
-                                   struct buffered_reader **buffered_readers,
-                                   unsigned int reader_count)
-{
-       int result;
-       unsigned int zone_count = reader_count;
-       u64 record_count = 0;
-       u64 collision_count = 0;
-       u32 first_list[MAX_ZONES];
-       u32 list_count[MAX_ZONES];
-       unsigned int z;
-       u32 list_next = 0;
-       const struct delta_zone *delta_zone;
-
-       /* Read and validate each header. */
-       for (z = 0; z < zone_count; z++) {
-               struct delta_index_header header;
-               u8 buffer[sizeof(struct delta_index_header)];
-               size_t offset = 0;
-
-               result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
-                                                      sizeof(buffer));
-               if (result != UDS_SUCCESS) {
-                       return uds_log_warning_strerror(result,
-                                                       "failed to read delta index header");
-               }
-
-               memcpy(&header.magic, buffer, MAGIC_SIZE);
-               offset += MAGIC_SIZE;
-               decode_u32_le(buffer, &offset, &header.zone_number);
-               decode_u32_le(buffer, &offset, &header.zone_count);
-               decode_u32_le(buffer, &offset, &header.first_list);
-               decode_u32_le(buffer, &offset, &header.list_count);
-               decode_u64_le(buffer, &offset, &header.record_count);
-               decode_u64_le(buffer, &offset, &header.collision_count);
-
-               result = ASSERT(offset == sizeof(struct delta_index_header),
-                               "%zu bytes decoded of %zu expected", offset,
-                               sizeof(struct delta_index_header));
-               if (result != UDS_SUCCESS) {
-                       return uds_log_warning_strerror(result,
-                                                       "failed to read delta index header");
-               }
-
-               if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "delta index file has bad magic number");
-               }
-
-               if (zone_count != header.zone_count) {
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "delta index files contain mismatched zone counts (%u,%u)",
-                                                       zone_count, header.zone_count);
-               }
-
-               if (header.zone_number != z) {
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "delta index zone %u found in slot %u",
-                                                       header.zone_number, z);
-               }
-
-               first_list[z] = header.first_list;
-               list_count[z] = header.list_count;
-               record_count += header.record_count;
-               collision_count += header.collision_count;
-
-               if (first_list[z] != list_next) {
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "delta index file for zone %u starts with list %u instead of list %u",
-                                                       z, first_list[z], list_next);
-               }
-
-               list_next += list_count[z];
-       }
-
-       if (list_next != delta_index->list_count) {
-               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                               "delta index files contain %u delta lists instead of %u delta lists",
-                                               list_next, delta_index->list_count);
-       }
-
-       if (collision_count > record_count) {
-               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                               "delta index files contain %llu collisions and %llu records",
-                                               (unsigned long long) collision_count,
-                                               (unsigned long long) record_count);
-       }
-
-       uds_reset_delta_index(delta_index);
-       delta_index->delta_zones[0].record_count = record_count;
-       delta_index->delta_zones[0].collision_count = collision_count;
-
-       /* Read the delta lists and distribute them to the proper zones. */
-       for (z = 0; z < zone_count; z++) {
-               u32 i;
-
-               delta_index->load_lists[z] = 0;
-               for (i = 0; i < list_count[z]; i++) {
-                       u16 delta_list_size;
-                       u32 list_number;
-                       unsigned int zone_number;
-                       u8 size_data[sizeof(u16)];
-
-                       result = uds_read_from_buffered_reader(buffered_readers[z],
-                                                              size_data,
-                                                              sizeof(size_data));
-                       if (result != UDS_SUCCESS) {
-                               return uds_log_warning_strerror(result,
-                                                               "failed to read delta index size");
-                       }
-
-                       delta_list_size = get_unaligned_le16(size_data);
-                       if (delta_list_size > 0)
-                               delta_index->load_lists[z] += 1;
-
-                       list_number = first_list[z] + i;
-                       zone_number = list_number / delta_index->lists_per_zone;
-                       delta_zone = &delta_index->delta_zones[zone_number];
-                       list_number -= delta_zone->first_list;
-                       delta_zone->delta_lists[list_number + 1].size = delta_list_size;
-               }
-       }
-
-       /* Prepare each zone to start receiving the delta list data. */
-       for (z = 0; z < delta_index->zone_count; z++)
-               rebalance_lists(&delta_index->delta_zones[z]);
-
-       return UDS_SUCCESS;
-}
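
/*
 * Editorial sketch (not part of the original file) of the per-zone save header
 * read above and written by uds_start_saving_delta_index() later in this file;
 * all multi-byte fields are little-endian:
 *
 *     magic            MAGIC_SIZE bytes (DELTA_INDEX_MAGIC)
 *     zone_number      u32
 *     zone_count       u32
 *     first_list       u32
 *     list_count       u32
 *     record_count     u64
 *     collision_count  u64
 *
 * The header is followed by one u16 size per delta list in the zone, and then
 * by the tagged delta list data records.
 */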
-
-static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
-                                     const struct delta_list_save_info *save_info,
-                                     const u8 *data)
-{
-       struct delta_list *delta_list;
-       u16 bit_count;
-       u16 byte_count;
-       u32 list_number = save_info->index - delta_zone->first_list;
-
-       if (list_number >= delta_zone->list_count) {
-               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                               "invalid delta list number %u not in range [%u,%u)",
-                                               save_info->index, delta_zone->first_list,
-                                               delta_zone->first_list + delta_zone->list_count);
-       }
-
-       delta_list = &delta_zone->delta_lists[list_number + 1];
-       if (delta_list->size == 0) {
-               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                               "unexpected delta list number %u",
-                                               save_info->index);
-       }
-
-       bit_count = delta_list->size + save_info->bit_offset;
-       byte_count = BITS_TO_BYTES(bit_count);
-       if (save_info->byte_count != byte_count) {
-               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                               "unexpected delta list size %u != %u",
-                                               save_info->byte_count, byte_count);
-       }
-
-       move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
-                 delta_list->size);
-       return UDS_SUCCESS;
-}
-
-static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
-                                  struct buffered_reader *buffered_reader, u8 *data)
-{
-       int result;
-       struct delta_list_save_info save_info;
-       u8 buffer[sizeof(struct delta_list_save_info)];
-       unsigned int new_zone;
-
-       result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
-       if (result != UDS_SUCCESS) {
-               return uds_log_warning_strerror(result,
-                                               "failed to read delta list data");
-       }
-
-       save_info = (struct delta_list_save_info) {
-               .tag = buffer[0],
-               .bit_offset = buffer[1],
-               .byte_count = get_unaligned_le16(&buffer[2]),
-               .index = get_unaligned_le32(&buffer[4]),
-       };
-
-       if ((save_info.bit_offset >= BITS_PER_BYTE) ||
-           (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
-               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                               "corrupt delta list data");
-       }
-
-       /* Make sure the data is intended for this delta index. */
-       if (save_info.tag != delta_index->tag)
-               return UDS_CORRUPT_DATA;
-
-       if (save_info.index >= delta_index->list_count) {
-               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                               "invalid delta list number %u of %u",
-                                               save_info.index,
-                                               delta_index->list_count);
-       }
-
-       result = uds_read_from_buffered_reader(buffered_reader, data,
-                                              save_info.byte_count);
-       if (result != UDS_SUCCESS) {
-               return uds_log_warning_strerror(result,
-                                               "failed to read delta list data");
-       }
-
-       delta_index->load_lists[load_zone] -= 1;
-       new_zone = save_info.index / delta_index->lists_per_zone;
-       return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
-                                         &save_info, data);
-}
-
-/* Restore delta lists from saved data. */
-int uds_finish_restoring_delta_index(struct delta_index *delta_index,
-                                    struct buffered_reader **buffered_readers,
-                                    unsigned int reader_count)
-{
-       int result;
-       int saved_result = UDS_SUCCESS;
-       unsigned int z;
-       u8 *data;
-
-       result = uds_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       for (z = 0; z < reader_count; z++) {
-               while (delta_index->load_lists[z] > 0) {
-                       result = restore_delta_list_data(delta_index, z,
-                                                        buffered_readers[z], data);
-                       if (result != UDS_SUCCESS) {
-                               saved_result = result;
-                               break;
-                       }
-               }
-       }
-
-       uds_free(data);
-       return saved_result;
-}
-
-int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
-                               unsigned int reader_count)
-{
-       int result;
-       unsigned int z;
-       u8 buffer[sizeof(struct delta_list_save_info)];
-
-       for (z = 0; z < reader_count; z++) {
-               result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
-                                                      sizeof(buffer));
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               if (buffer[0] != 'z')
-                       return UDS_CORRUPT_DATA;
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
-{
-       struct delta_list *delta_list;
-       u8 buffer[sizeof(struct delta_list_save_info)];
-       int result;
-
-       delta_list = &zone->delta_lists[flush_index + 1];
-
-       buffer[0] = zone->tag;
-       buffer[1] = delta_list->start % BITS_PER_BYTE;
-       put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
-       put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);
-
-       result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
-                                             sizeof(buffer));
-       if (result != UDS_SUCCESS) {
-               uds_log_warning_strerror(result, "failed to write delta list memory");
-               return result;
-       }
-
-       result = uds_write_to_buffered_writer(zone->buffered_writer,
-                                             zone->memory + get_delta_list_byte_start(delta_list),
-                                             get_delta_list_byte_size(delta_list));
-       if (result != UDS_SUCCESS)
-               uds_log_warning_strerror(result, "failed to write delta list memory");
-
-       return result;
-}
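
/*
 * Editorial sketch (not part of the original file) of each saved delta list
 * record, as written by flush_delta_list() and parsed by
 * restore_delta_list_data():
 *
 *     byte 0       tag (the zone tag, or 'z' for the trailing guard record)
 *     byte 1       bit offset of the list data within its first byte
 *     bytes 2-3    byte count of the list data, little-endian
 *     bytes 4-7    delta list number, little-endian
 *
 * followed by that many bytes of raw delta list memory.
 */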
-
-/* Start saving a delta index zone to a buffered output stream. */
-int uds_start_saving_delta_index(const struct delta_index *delta_index,
-                                unsigned int zone_number,
-                                struct buffered_writer *buffered_writer)
-{
-       int result;
-       u32 i;
-       struct delta_zone *delta_zone;
-       u8 buffer[sizeof(struct delta_index_header)];
-       size_t offset = 0;
-
-       delta_zone = &delta_index->delta_zones[zone_number];
-       memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
-       offset += MAGIC_SIZE;
-       encode_u32_le(buffer, &offset, zone_number);
-       encode_u32_le(buffer, &offset, delta_index->zone_count);
-       encode_u32_le(buffer, &offset, delta_zone->first_list);
-       encode_u32_le(buffer, &offset, delta_zone->list_count);
-       encode_u64_le(buffer, &offset, delta_zone->record_count);
-       encode_u64_le(buffer, &offset, delta_zone->collision_count);
-
-       result = ASSERT(offset == sizeof(struct delta_index_header),
-                       "%zu bytes encoded of %zu expected", offset,
-                       sizeof(struct delta_index_header));
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
-       if (result != UDS_SUCCESS)
-               return uds_log_warning_strerror(result,
-                                               "failed to write delta index header");
-
-       for (i = 0; i < delta_zone->list_count; i++) {
-               u8 data[sizeof(u16)];
-               struct delta_list *delta_list;
-
-               delta_list = &delta_zone->delta_lists[i + 1];
-               put_unaligned_le16(delta_list->size, data);
-               result = uds_write_to_buffered_writer(buffered_writer, data,
-                                                     sizeof(data));
-               if (result != UDS_SUCCESS)
-                       return uds_log_warning_strerror(result,
-                                                       "failed to write delta list size");
-       }
-
-       delta_zone->buffered_writer = buffered_writer;
-       return UDS_SUCCESS;
-}
-
-int uds_finish_saving_delta_index(const struct delta_index *delta_index,
-                                 unsigned int zone_number)
-{
-       int result;
-       int first_error = UDS_SUCCESS;
-       u32 i;
-       struct delta_zone *delta_zone;
-       struct delta_list *delta_list;
-
-       delta_zone = &delta_index->delta_zones[zone_number];
-       for (i = 0; i < delta_zone->list_count; i++) {
-               delta_list = &delta_zone->delta_lists[i + 1];
-               if (delta_list->size > 0) {
-                       result = flush_delta_list(delta_zone, i);
-                       if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
-                               first_error = result;
-               }
-       }
-
-       delta_zone->buffered_writer = NULL;
-       return first_error;
-}
-
-int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
-{
-       int result;
-       u8 buffer[sizeof(struct delta_list_save_info)];
-
-       memset(buffer, 0, sizeof(struct delta_list_save_info));
-       buffer[0] = 'z';
-
-       result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
-       if (result != UDS_SUCCESS)
-               uds_log_warning_strerror(result, "failed to write guard delta list");
-
-       return UDS_SUCCESS;
-}
-
-size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
-{
-       /* A single zone will use at least as much memory as any other zone count. */
-       return (sizeof(struct delta_index_header) +
-               list_count * (sizeof(struct delta_list_save_info) + 1) +
-               get_zone_memory_size(1, memory_size));
-}
-
-static int assert_not_at_end(const struct delta_index_entry *delta_entry)
-{
-       int result = ASSERT(!delta_entry->at_end,
-                           "operation is invalid because the list entry is at the end of the delta list");
-       if (result != UDS_SUCCESS)
-               result = UDS_BAD_STATE;
-
-       return result;
-}
-
-/*
- * Prepare to search for an entry in the specified delta list.
- *
- * This is always the first function to be called when dealing with delta index entries. It is
- * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
- * fields of the delta_index_entry argument will be set up for iteration, but will not contain an
- * entry from the list.
- */
-int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
-                                u32 key, struct delta_index_entry *delta_entry)
-{
-       int result;
-       unsigned int zone_number;
-       struct delta_zone *delta_zone;
-       struct delta_list *delta_list;
-
-       result = ASSERT((list_number < delta_index->list_count),
-                       "Delta list number (%u) is out of range (%u)", list_number,
-                       delta_index->list_count);
-       if (result != UDS_SUCCESS)
-               return UDS_CORRUPT_DATA;
-
-       zone_number = list_number / delta_index->lists_per_zone;
-       delta_zone = &delta_index->delta_zones[zone_number];
-       list_number -= delta_zone->first_list;
-       result = ASSERT((list_number < delta_zone->list_count),
-                       "Delta list number (%u) is out of range (%u) for zone (%u)",
-                       list_number, delta_zone->list_count, zone_number);
-       if (result != UDS_SUCCESS)
-               return UDS_CORRUPT_DATA;
-
-       if (delta_index->mutable) {
-               delta_list = &delta_zone->delta_lists[list_number + 1];
-       } else {
-               u32 end_offset;
-
-               /*
-                * Translate the immutable delta list header into a temporary
-                * full delta list header.
-                */
-               delta_list = &delta_entry->temp_delta_list;
-               delta_list->start = get_immutable_start(delta_zone->memory, list_number);
-               end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
-               delta_list->size = end_offset - delta_list->start;
-               delta_list->save_key = 0;
-               delta_list->save_offset = 0;
-       }
-
-       if (key > delta_list->save_key) {
-               delta_entry->key = delta_list->save_key;
-               delta_entry->offset = delta_list->save_offset;
-       } else {
-               delta_entry->key = 0;
-               delta_entry->offset = 0;
-               if (key == 0) {
-                       /*
-                        * This usually means we're about to walk the entire delta list, so get all
-                        * of it into the CPU cache.
-                        */
-                       uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
-                                          delta_list->size / BITS_PER_BYTE, false);
-               }
-       }
-
-       delta_entry->at_end = false;
-       delta_entry->delta_zone = delta_zone;
-       delta_entry->delta_list = delta_list;
-       delta_entry->entry_bits = 0;
-       delta_entry->is_collision = false;
-       delta_entry->list_number = list_number;
-       delta_entry->list_overflow = false;
-       delta_entry->value_bits = delta_zone->value_bits;
-       return UDS_SUCCESS;
-}
-
-static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
-{
-       return delta_entry->delta_list->start + delta_entry->offset;
-}
-
-/*
- * Decode a delta index entry delta value. The delta_index_entry basically describes the previous
- * list entry, and has had its offset field changed to point to the subsequent entry. We decode the
- * bit stream and update the delta_list_entry to describe the entry.
- */
-static inline void decode_delta(struct delta_index_entry *delta_entry)
-{
-       int key_bits;
-       u32 delta;
-       const struct delta_zone *delta_zone = delta_entry->delta_zone;
-       const u8 *memory = delta_zone->memory;
-       u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
-       const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
-       int offset = delta_offset % BITS_PER_BYTE;
-       u32 data = get_unaligned_le32(addr) >> offset;
-
-       addr += sizeof(u32);
-       key_bits = delta_zone->min_bits;
-       delta = data & ((1 << key_bits) - 1);
-       if (delta >= delta_zone->min_keys) {
-               data >>= key_bits;
-               if (data == 0) {
-                       key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
-                       while ((data = get_unaligned_le32(addr)) == 0) {
-                               addr += sizeof(u32);
-                               key_bits += sizeof(u32) * BITS_PER_BYTE;
-                       }
-               }
-               key_bits += ffs(data);
-               delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
-       }
-       delta_entry->delta = delta;
-       delta_entry->key += delta;
-
-       /* Check for a collision, a delta of zero after the start. */
-       if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
-               delta_entry->is_collision = true;
-               delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
-       } else {
-               delta_entry->is_collision = false;
-               delta_entry->entry_bits = delta_entry->value_bits + key_bits;
-       }
-}
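
/*
 * Editorial worked example (not part of the original file), using the constants
 * from the compute_coding_constants() example above (min_bits = 12,
 * min_keys = 1257, incr_keys = 2839): a delta of 7000 is stored by
 * encode_delta() as the 12-bit field 1322 (7000 - 1257 = 5743;
 * 5743 % 2839 = 65; 65 + 1257 = 1322), then 5743 / 2839 = 2 zero bits, then a
 * terminating one bit. decode_delta() reads the field 1322, sees it is at least
 * min_keys, finds the one bit after two zeros so key_bits = 12 + 3 = 15, and
 * recovers 1322 + (15 - 12 - 1) * 2839 = 7000.
 */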
-
-noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
-{
-       int result;
-       const struct delta_list *delta_list;
-       u32 next_offset;
-       u16 size;
-
-       result = assert_not_at_end(delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       delta_list = delta_entry->delta_list;
-       delta_entry->offset += delta_entry->entry_bits;
-       size = delta_list->size;
-       if (unlikely(delta_entry->offset >= size)) {
-               delta_entry->at_end = true;
-               delta_entry->delta = 0;
-               delta_entry->is_collision = false;
-               result = ASSERT((delta_entry->offset == size),
-                               "next offset past end of delta list");
-               if (result != UDS_SUCCESS)
-                       result = UDS_CORRUPT_DATA;
-
-               return result;
-       }
-
-       decode_delta(delta_entry);
-
-       next_offset = delta_entry->offset + delta_entry->entry_bits;
-       if (next_offset > size) {
-               /*
-                * This is not an assertion because uds_validate_chapter_index_page() wants to
-                * handle this error.
-                */
-               uds_log_warning("Decoded past the end of the delta list");
-               return UDS_CORRUPT_DATA;
-       }
-
-       return UDS_SUCCESS;
-}
-
-int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
-{
-       int result;
-       struct delta_list *delta_list = delta_entry->delta_list;
-
-       result = ASSERT(!delta_entry->is_collision, "entry is not a collision");
-       if (result != UDS_SUCCESS)
-               return result;
-
-       delta_list->save_key = delta_entry->key - delta_entry->delta;
-       delta_list->save_offset = delta_entry->offset;
-       return UDS_SUCCESS;
-}
-
-static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
-{
-       const struct delta_zone *delta_zone = delta_entry->delta_zone;
-       u32 key_bits = (delta_zone->min_bits +
-                       ((delta_zone->incr_keys - delta_zone->min_keys + delta) /
-                        delta_zone->incr_keys));
-
-       delta_entry->delta = delta;
-       delta_entry->entry_bits = delta_entry->value_bits + key_bits;
-}
-
-static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
-{
-       u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
-       const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
-       int size = COLLISION_BYTES;
-       int shift = offset % BITS_PER_BYTE;
-
-       while (--size >= 0)
-               *name++ = get_unaligned_le16(addr++) >> shift;
-}
-
-static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
-{
-       u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
-       u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
-       int size = COLLISION_BYTES;
-       int shift = offset % BITS_PER_BYTE;
-       u16 mask = ~((u16) 0xFF << shift);
-       u16 data;
-
-       while (--size >= 0) {
-               data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
-               put_unaligned_le16(data, addr++);
-       }
-}
-
-int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
-                             u32 key, const u8 *name,
-                             struct delta_index_entry *delta_entry)
-{
-       int result;
-
-       result = uds_start_delta_index_search(delta_index, list_number, key,
-                                             delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       do {
-               result = uds_next_delta_index_entry(delta_entry);
-               if (result != UDS_SUCCESS)
-                       return result;
-       } while (!delta_entry->at_end && (key > delta_entry->key));
-
-       result = uds_remember_delta_index_offset(delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (!delta_entry->at_end && (key == delta_entry->key)) {
-               struct delta_index_entry collision_entry = *delta_entry;
-
-               for (;;) {
-                       u8 full_name[COLLISION_BYTES];
-
-                       result = uds_next_delta_index_entry(&collision_entry);
-                       if (result != UDS_SUCCESS)
-                               return result;
-
-                       if (collision_entry.at_end || !collision_entry.is_collision)
-                               break;
-
-                       get_collision_name(&collision_entry, full_name);
-                       if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
-                               *delta_entry = collision_entry;
-                               break;
-                       }
-               }
-       }
-
-       return UDS_SUCCESS;
-}
-
-int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
-{
-       int result;
-
-       result = assert_not_at_end(delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = ASSERT(delta_entry->is_collision,
-                       "Cannot get full block name from a non-collision delta index entry");
-       if (result != UDS_SUCCESS)
-               return UDS_BAD_STATE;
-
-       get_collision_name(delta_entry, name);
-       return UDS_SUCCESS;
-}
-
-u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
-{
-       return get_field(delta_entry->delta_zone->memory,
-                        get_delta_entry_offset(delta_entry), delta_entry->value_bits);
-}
-
-static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
-{
-       int result = ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
-                           "delta index is mutable");
-       if (result != UDS_SUCCESS)
-               result = UDS_BAD_STATE;
-
-       return result;
-}
-
-int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
-{
-       int result;
-       u32 value_mask = (1 << delta_entry->value_bits) - 1;
-
-       result = assert_mutable_entry(delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = assert_not_at_end(delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = ASSERT((value & value_mask) == value,
-                       "Value (%u) being set in a delta index is too large (must fit in %u bits)",
-                       value, delta_entry->value_bits);
-       if (result != UDS_SUCCESS)
-               return UDS_INVALID_ARGUMENT;
-
-       set_field(value, delta_entry->delta_zone->memory,
-                 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
-       return UDS_SUCCESS;
-}
-
-/*
- * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
- * by growing_index, then rebalancing the lists in the new chunk.
- */
-static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
-                            size_t growing_size)
-{
-       ktime_t start_time;
-       ktime_t end_time;
-       struct delta_list *delta_lists;
-       u32 i;
-       size_t used_space;
-
-       /* Calculate the amount of space that is or will be in use. */
-       start_time = current_time_ns(CLOCK_MONOTONIC);
-       delta_lists = delta_zone->delta_lists;
-       used_space = growing_size;
-       for (i = 0; i <= delta_zone->list_count + 1; i++)
-               used_space += get_delta_list_byte_size(&delta_lists[i]);
-
-       if (delta_zone->size < used_space)
-               return UDS_OVERFLOW;
-
-       /* Compute the new offsets of the delta lists. */
-       compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);
-
-       /*
-        * When we rebalance the delta list, we will include the end guard list in the rebalancing.
-        * It contains the end guard data, which must be copied.
-        */
-       rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
-       end_time = current_time_ns(CLOCK_MONOTONIC);
-       delta_zone->rebalance_count++;
-       delta_zone->rebalance_time += ktime_sub(end_time, start_time);
-       return UDS_SUCCESS;
-}
-
-static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
-{
-       u64 free_before;
-       u64 free_after;
-       u64 source;
-       u64 destination;
-       u32 count;
-       bool before_flag;
-       u8 *memory;
-       struct delta_zone *delta_zone = delta_entry->delta_zone;
-       struct delta_list *delta_list = delta_entry->delta_list;
-       /* Compute bits in use before and after the inserted bits. */
-       u32 total_size = delta_list->size;
-       u32 before_size = delta_entry->offset;
-       u32 after_size = total_size - delta_entry->offset;
-
-       if (total_size + size > U16_MAX) {
-               delta_entry->list_overflow = true;
-               delta_zone->overflow_count++;
-               return UDS_OVERFLOW;
-       }
-
-       /* Compute bits available before and after the delta list. */
-       free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
-       free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
-
-       if ((size <= free_before) && (size <= free_after)) {
-               /*
-                * We have enough space to use either before or after the list. Select the smaller
-                * amount of data. If it is exactly the same, try to take from the larger amount of
-                * free space.
-                */
-               if (before_size < after_size)
-                       before_flag = true;
-               else if (after_size < before_size)
-                       before_flag = false;
-               else
-                       before_flag = free_before > free_after;
-       } else if (size <= free_before) {
-               /* There is space before but not after. */
-               before_flag = true;
-       } else if (size <= free_after) {
-               /* There is space after but not before. */
-               before_flag = false;
-       } else {
-               /*
-                * Neither of the surrounding spaces is large enough for this request. Extend
-                * and/or rebalance the delta list memory choosing to move the least amount of
-                * data.
-                */
-               int result;
-               u32 growing_index = delta_entry->list_number + 1;
-
-               before_flag = before_size < after_size;
-               if (!before_flag)
-                       growing_index++;
-               result = extend_delta_zone(delta_zone, growing_index,
-                                          BITS_TO_BYTES(size));
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       delta_list->size += size;
-       if (before_flag) {
-               source = delta_list->start;
-               destination = source - size;
-               delta_list->start -= size;
-               count = before_size;
-       } else {
-               source = delta_list->start + delta_entry->offset;
-               destination = source + size;
-               count = after_size;
-       }
-
-       memory = delta_zone->memory;
-       move_bits(memory, source, memory, destination, count);
-       return UDS_SUCCESS;
-}
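
/*
 * Editorial sketch (not part of the original file): when inserting 20 bits at
 * offset 64 of a 200-bit list with room on both sides, before_size (64) is less
 * than after_size (136), so the bits before the insertion point are moved and
 * the list start shifts down by 20 bits; in the mirror case the tail is moved
 * up instead. Only when neither neighboring gap is large enough does
 * extend_delta_zone() rebalance the whole zone.
 */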
-
-static void encode_delta(const struct delta_index_entry *delta_entry)
-{
-       u32 temp;
-       u32 t1;
-       u32 t2;
-       u64 offset;
-       const struct delta_zone *delta_zone = delta_entry->delta_zone;
-       u8 *memory = delta_zone->memory;
-
-       offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
-       if (delta_entry->delta < delta_zone->min_keys) {
-               set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
-               return;
-       }
-
-       temp = delta_entry->delta - delta_zone->min_keys;
-       t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
-       t2 = temp / delta_zone->incr_keys;
-       set_field(t1, memory, offset, delta_zone->min_bits);
-       set_zero(memory, offset + delta_zone->min_bits, t2);
-       set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
-}
-
-static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
-                        const u8 *name)
-{
-       u8 *memory = delta_entry->delta_zone->memory;
-       u64 offset = get_delta_entry_offset(delta_entry);
-
-       set_field(value, memory, offset, delta_entry->value_bits);
-       encode_delta(delta_entry);
-       if (name != NULL)
-               set_collision_name(delta_entry, name);
-}
-
-/*
- * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
- * be provided.
- */
-int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
-                             const u8 *name)
-{
-       int result;
-       struct delta_zone *delta_zone;
-
-       result = assert_mutable_entry(delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (delta_entry->is_collision) {
-               /*
-                * The caller wants us to insert a collision entry onto a collision entry. This
-                * happens when we find a collision and attempt to add the name again to the index.
-                * This is normally a fatal error unless we are replaying a closed chapter while we
-                * are rebuilding a volume index.
-                */
-               return UDS_DUPLICATE_NAME;
-       }
-
-       if (delta_entry->offset < delta_entry->delta_list->save_offset) {
-               /*
-                * The saved entry offset is after the new entry and will no longer be valid, so
-                * replace it with the insertion point.
-                */
-               result = uds_remember_delta_index_offset(delta_entry);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       if (name != NULL) {
-               /* Insert a collision entry which is placed after this entry. */
-               result = assert_not_at_end(delta_entry);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               result = ASSERT((key == delta_entry->key),
-                               "incorrect key for collision entry");
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               delta_entry->offset += delta_entry->entry_bits;
-               set_delta(delta_entry, 0);
-               delta_entry->is_collision = true;
-               delta_entry->entry_bits += COLLISION_BITS;
-               result = insert_bits(delta_entry, delta_entry->entry_bits);
-       } else if (delta_entry->at_end) {
-               /* Insert a new entry at the end of the delta list. */
-               result = ASSERT((key >= delta_entry->key), "key past end of list");
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               set_delta(delta_entry, key - delta_entry->key);
-               delta_entry->key = key;
-               delta_entry->at_end = false;
-               result = insert_bits(delta_entry, delta_entry->entry_bits);
-       } else {
-               u16 old_entry_size;
-               u16 additional_size;
-               struct delta_index_entry next_entry;
-               u32 next_value;
-
-               /*
-                * Insert a new entry which requires the delta in the following entry to be
-                * updated.
-                */
-               result = ASSERT((key < delta_entry->key),
-                               "key precedes following entry");
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               result = ASSERT((key >= delta_entry->key - delta_entry->delta),
-                               "key effects following entry's delta");
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               old_entry_size = delta_entry->entry_bits;
-               next_entry = *delta_entry;
-               next_value = uds_get_delta_entry_value(&next_entry);
-               set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
-               delta_entry->key = key;
-               set_delta(&next_entry, next_entry.key - key);
-               next_entry.offset += delta_entry->entry_bits;
-               /* The two new entries are always bigger than the single entry being replaced. */
-               additional_size = (delta_entry->entry_bits +
-                                  next_entry.entry_bits - old_entry_size);
-               result = insert_bits(delta_entry, additional_size);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               encode_entry(&next_entry, next_value, NULL);
-       }
-
-       if (result != UDS_SUCCESS)
-               return result;
-
-       encode_entry(delta_entry, value, name);
-       delta_zone = delta_entry->delta_zone;
-       delta_zone->record_count++;
-       delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
-       return UDS_SUCCESS;
-}
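
Callers typically reach this function through uds_get_delta_index_entry(): an unsuccessful search
leaves the entry positioned at the insertion point, and the record name is only passed in when
deliberately adding a collision. A minimal sketch of the non-collision path, assuming index,
list_number, key, value, and name are already in hand (collision handling omitted):

    struct delta_index_entry entry;
    int result;

    result = uds_get_delta_index_entry(index, list_number, key, name, &entry);
    if (result != UDS_SUCCESS)
            return result;

    if (!entry.at_end && !entry.is_collision && (entry.key == key)) {
            /* A record with this key already exists: update its payload in place. */
            return uds_set_delta_entry_value(&entry, value);
    }

    /* Not found: the entry marks the insertion point for a new, non-collision record. */
    return uds_put_delta_index_entry(&entry, key, value, NULL);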
-
-static void delete_bits(const struct delta_index_entry *delta_entry, int size)
-{
-       u64 source;
-       u64 destination;
-       u32 count;
-       bool before_flag;
-       struct delta_list *delta_list = delta_entry->delta_list;
-       u8 *memory = delta_entry->delta_zone->memory;
-       /* Compute bits retained before and after the deleted bits. */
-       u32 total_size = delta_list->size;
-       u32 before_size = delta_entry->offset;
-       u32 after_size = total_size - delta_entry->offset - size;
-
-       /*
-        * Determine whether to add to the available space either before or after the delta list.
-        * We prefer to move the least amount of data. If it is exactly the same, try to add to the
-        * smaller amount of free space.
-        */
-       if (before_size < after_size) {
-               before_flag = true;
-       } else if (after_size < before_size) {
-               before_flag = false;
-       } else {
-               u64 free_before =
-                       (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
-               u64 free_after =
-                       (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
-
-               before_flag = (free_before < free_after);
-       }
-
-       delta_list->size -= size;
-       if (before_flag) {
-               source = delta_list->start;
-               destination = source + size;
-               delta_list->start += size;
-               count = before_size;
-       } else {
-               destination = delta_list->start + delta_entry->offset;
-               source = destination + size;
-               count = after_size;
-       }
-
-       move_bits(memory, source, memory, destination, count);
-}
-
-int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
-{
-       int result;
-       struct delta_index_entry next_entry;
-       struct delta_zone *delta_zone;
-       struct delta_list *delta_list;
-
-       result = assert_mutable_entry(delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       next_entry = *delta_entry;
-       result = uds_next_delta_index_entry(&next_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       delta_zone = delta_entry->delta_zone;
-
-       if (delta_entry->is_collision) {
-               /* This is a collision entry, so just remove it. */
-               delete_bits(delta_entry, delta_entry->entry_bits);
-               next_entry.offset = delta_entry->offset;
-               delta_zone->collision_count -= 1;
-       } else if (next_entry.at_end) {
-               /* This entry is at the end of the list, so just remove it. */
-               delete_bits(delta_entry, delta_entry->entry_bits);
-               next_entry.key -= delta_entry->delta;
-               next_entry.offset = delta_entry->offset;
-       } else {
-               /* The delta in the next entry needs to be updated. */
-               u32 next_value = uds_get_delta_entry_value(&next_entry);
-               u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;
-
-               if (next_entry.is_collision) {
-                       next_entry.is_collision = false;
-                       delta_zone->collision_count -= 1;
-               }
-
-               set_delta(&next_entry, delta_entry->delta + next_entry.delta);
-               next_entry.offset = delta_entry->offset;
-               /* The one new entry is always smaller than the two entries being replaced. */
-               delete_bits(delta_entry, old_size - next_entry.entry_bits);
-               encode_entry(&next_entry, next_value, NULL);
-       }
-
-       delta_zone->record_count--;
-       delta_zone->discard_count++;
-       *delta_entry = next_entry;
-
-       delta_list = delta_entry->delta_list;
-       if (delta_entry->offset < delta_list->save_offset) {
-               /* The saved entry offset is no longer valid. */
-               delta_list->save_key = 0;
-               delta_list->save_offset = 0;
-       }
-
-       return UDS_SUCCESS;
-}
-
-void uds_get_delta_index_stats(const struct delta_index *delta_index,
-                              struct delta_index_stats *stats)
-{
-       unsigned int z;
-       const struct delta_zone *delta_zone;
-
-       memset(stats, 0, sizeof(struct delta_index_stats));
-       for (z = 0; z < delta_index->zone_count; z++) {
-               delta_zone = &delta_index->delta_zones[z];
-               stats->rebalance_time += delta_zone->rebalance_time;
-               stats->rebalance_count += delta_zone->rebalance_count;
-               stats->record_count += delta_zone->record_count;
-               stats->collision_count += delta_zone->collision_count;
-               stats->discard_count += delta_zone->discard_count;
-               stats->overflow_count += delta_zone->overflow_count;
-               stats->list_count += delta_zone->list_count;
-       }
-}
-
-size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
-{
-       u16 min_bits;
-       u32 incr_keys;
-       u32 min_keys;
-
-       compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
-       /* On average, each delta is encoded into about min_bits + 1.5 bits. */
-       return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
-}
-
-u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
-                                  u32 payload_bits, size_t bytes_per_page)
-{
-       unsigned int bits_per_delta_list;
-       unsigned int bits_per_page;
-       size_t bits_per_index;
-
-       /* Compute the expected number of bits needed for all the entries. */
-       bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
-                                                     payload_bits);
-       bits_per_delta_list = bits_per_index / list_count;
-
-       /* Add in the immutable delta list headers. */
-       bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
-       /* Compute the number of usable bits on an immutable index page. */
-       bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
-       /*
-        * Reduce the bits per page by one immutable delta list header and one delta list to
-        * account for internal fragmentation.
-        */
-       bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
-       /* Now compute the number of pages needed. */
-       return DIV_ROUND_UP(bits_per_index, bits_per_page);
-}
-
-void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
-{
-       uds_log_ratelimit(uds_log_info,
-                         "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
-                         delta_entry->list_number, delta_entry->key,
-                         delta_entry->offset, delta_entry->at_end ? " end" : "",
-                         delta_entry->is_collision ? " collision" : "",
-                         delta_entry->delta_list->size,
-                         delta_entry->list_overflow ? " overflow" : "");
-       delta_entry->list_overflow = false;
-}
diff --git a/drivers/md/dm-vdo/delta-index.h b/drivers/md/dm-vdo/delta-index.h
deleted file mode 100644 (file)
index b3b38fb..0000000
+++ /dev/null
@@ -1,278 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_DELTA_INDEX_H
-#define UDS_DELTA_INDEX_H
-
-#include <linux/cache.h>
-
-#include "config.h"
-#include "io-factory.h"
-#include "numeric.h"
-#include "time-utils.h"
-
-/*
- * A delta index is a key-value store, where each entry maps an address (the key) to a payload (the
- * value). The entries are sorted by address, and only the delta between successive addresses is
- * stored in the entry. The addresses are assumed to be uniformly distributed, and the deltas are
- * therefore exponentially distributed.
- *
- * A delta_index can either be mutable or immutable depending on its expected use. The immutable
- * form of a delta index is used for the indexes of closed chapters committed to the volume. The
- * mutable form of a delta index is used by the volume index, and also by the chapter index in an
- * open chapter. Like the index as a whole, each mutable delta index is divided into a number of
- * independent zones.
- */
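
As a rough lifecycle sketch (not taken from this file): a mutable delta index is initialized with
its zone count, list count, and coding parameters, used through the entry functions declared
below, and then torn down with the matching uninitialize call. All values here are illustrative
only.

    struct delta_index index;
    int result;

    /* Illustrative parameters: one zone, 1024 lists, mean delta 2^16, 8 payload bits, 2 MB. */
    result = uds_initialize_delta_index(&index, 1, 1024, 1 << 16, 8,
                                        2 * 1024 * 1024, 'm');
    if (result != UDS_SUCCESS)
            return result;

    /* ... search, insert, and remove entries ... */

    uds_uninitialize_delta_index(&index);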
-
-struct delta_list {
-       /* The offset of the delta list start, in bits */
-       u64 start;
-       /* The number of bits in the delta list */
-       u16 size;
-       /* Where the last search "found" the key, in bits */
-       u16 save_offset;
-       /* The key for the record just before save_offset */
-       u32 save_key;
-};
-
-struct delta_zone {
-       /* The delta list memory */
-       u8 *memory;
-       /* The delta list headers */
-       struct delta_list *delta_lists;
-       /* Temporary starts of delta lists */
-       u64 *new_offsets;
-       /* Buffered writer for saving an index */
-       struct buffered_writer *buffered_writer;
-       /* The size of delta list memory */
-       size_t size;
-       /* Nanoseconds spent rebalancing */
-       ktime_t rebalance_time;
-       /* Number of memory rebalances */
-       u32 rebalance_count;
-       /* The number of bits in a stored value */
-       u8 value_bits;
-       /* The number of bits in the minimal key code */
-       u16 min_bits;
-       /* The number of keys used in a minimal code */
-       u32 min_keys;
-       /* The number of keys used for another code bit */
-       u32 incr_keys;
-       /* The number of records in the index */
-       u64 record_count;
-       /* The number of collision records */
-       u64 collision_count;
-       /* The number of records removed */
-       u64 discard_count;
-       /* The number of UDS_OVERFLOW errors detected */
-       u64 overflow_count;
-       /* The index of the first delta list */
-       u32 first_list;
-       /* The number of delta lists */
-       u32 list_count;
-       /* Tag belonging to this delta index */
-       u8 tag;
-} __aligned(L1_CACHE_BYTES);
-
-struct delta_list_save_info {
-       /* Tag identifying which delta index this list is in */
-       u8 tag;
-       /* Bit offset of the start of the list data */
-       u8 bit_offset;
-       /* Number of bytes of list data */
-       u16 byte_count;
-       /* The delta list number within the delta index */
-       u32 index;
-} __packed;
-
-struct delta_index {
-       /* The zones */
-       struct delta_zone *delta_zones;
-       /* The number of zones */
-       unsigned int zone_count;
-       /* The number of delta lists */
-       u32 list_count;
-       /* Maximum lists per zone */
-       u32 lists_per_zone;
-       /* Total memory allocated to this index */
-       size_t memory_size;
-       /* The number of non-empty lists at load time per zone */
-       u32 load_lists[MAX_ZONES];
-       /* True if this index is mutable */
-       bool mutable;
-       /* Tag belonging to this delta index */
-       u8 tag;
-};
-
-/*
- * A delta_index_page describes a single page of a chapter index. The delta_index field allows the
- * page to be treated as an immutable delta_index. We use the delta_zone field to treat the chapter
- * index page as a single zone index, without the need for an additional memory allocation.
- */
-struct delta_index_page {
-       struct delta_index delta_index;
-       /* These values are loaded from the delta_page_header */
-       u32 lowest_list_number;
-       u32 highest_list_number;
-       u64 virtual_chapter_number;
-       /* This structure describes the single zone of a delta index page. */
-       struct delta_zone delta_zone;
-};
-
-/*
- * Notes on the delta_index_entries:
- *
- * The fields documented as "public" can be read by any code that uses a delta_index. The fields
- * documented as "private" carry information between delta_index method calls and should not be
- * used outside the delta_index module.
- *
- * (1) The delta_index_entry is used like an iterator when searching a delta list.
- *
- * (2) It is also the result of a successful search and can be used to refer to the element found
- *     by the search.
- *
- * (3) It is also the result of an unsuccessful search and can be used to refer to the insertion
- *     point for a new record.
- *
- * (4) If at_end is true, the delta_list entry can only be used as the insertion point for a new
- *     record at the end of the list.
- *
- * (5) If at_end is false and is_collision is true, the delta_list entry fields refer to a
- *     collision entry in the list, and the delta_list entry can be used as a reference to this
- *     entry.
- *
- * (6) If at_end is false and is_collision is false, the delta_list entry fields refer to a
- *     non-collision entry in the list. Such delta_list entries can be used as a reference to a
- *     found entry, or an insertion point for a non-collision entry before this entry, or an
- *     insertion point for a collision entry that collides with this entry.
- */
-struct delta_index_entry {
-       /* Public fields */
-       /* The key for this entry */
-       u32 key;
-       /* We are after the last list entry */
-       bool at_end;
-       /* This record is a collision */
-       bool is_collision;
-
-       /* Private fields */
-       /* This delta list overflowed */
-       bool list_overflow;
-       /* The number of bits used for the value */
-       u8 value_bits;
-       /* The number of bits used for the entire entry */
-       u16 entry_bits;
-       /* The delta index zone */
-       struct delta_zone *delta_zone;
-       /* The delta list containing the entry */
-       struct delta_list *delta_list;
-       /* The delta list number */
-       u32 list_number;
-       /* Bit offset of this entry within the list */
-       u16 offset;
-       /* The delta between this and previous entry */
-       u32 delta;
-       /* Temporary delta list for immutable indices */
-       struct delta_list temp_delta_list;
-};
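
To make the iterator states above concrete, here is a minimal walk of one delta list, reading
every payload until at_end is set. It assumes delta_index and list_number are in hand, that
uds_start_delta_index_search() positions the iterator before the first entry (as
uds_get_delta_index_entry() relies on), and that process_record() is a hypothetical consumer.

    struct delta_index_entry entry;
    int result;

    result = uds_start_delta_index_search(delta_index, list_number, 0, &entry);
    if (result != UDS_SUCCESS)
            return result;

    for (;;) {
            result = uds_next_delta_index_entry(&entry);
            if (result != UDS_SUCCESS)
                    return result;

            if (entry.at_end)
                    break;

            /* entry.key and the stored payload are valid for this record. */
            process_record(entry.key, uds_get_delta_entry_value(&entry));
    }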
-
-struct delta_index_stats {
-       /* Number of bytes allocated */
-       size_t memory_allocated;
-       /* Nanoseconds spent rebalancing */
-       ktime_t rebalance_time;
-       /* Number of memory rebalances */
-       u32 rebalance_count;
-       /* The number of records in the index */
-       u64 record_count;
-       /* The number of collision records */
-       u64 collision_count;
-       /* The number of records removed */
-       u64 discard_count;
-       /* The number of UDS_OVERFLOW errors detected */
-       u64 overflow_count;
-       /* The number of delta lists */
-       u32 list_count;
-};
-
-int __must_check uds_initialize_delta_index(struct delta_index *delta_index,
-                                           unsigned int zone_count, u32 list_count,
-                                           u32 mean_delta, u32 payload_bits,
-                                           size_t memory_size, u8 tag);
-
-int __must_check uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
-                                                u64 expected_nonce, u32 mean_delta,
-                                                u32 payload_bits, u8 *memory,
-                                                size_t memory_size);
-
-void uds_uninitialize_delta_index(struct delta_index *delta_index);
-
-void uds_reset_delta_index(const struct delta_index *delta_index);
-
-int __must_check uds_pack_delta_index_page(const struct delta_index *delta_index,
-                                          u64 header_nonce, u8 *memory,
-                                          size_t memory_size,
-                                          u64 virtual_chapter_number, u32 first_list,
-                                          u32 *list_count);
-
-int __must_check uds_start_restoring_delta_index(struct delta_index *delta_index,
-                                                struct buffered_reader **buffered_readers,
-                                                unsigned int reader_count);
-
-int __must_check uds_finish_restoring_delta_index(struct delta_index *delta_index,
-                                                 struct buffered_reader **buffered_readers,
-                                                 unsigned int reader_count);
-
-int __must_check uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
-                                            unsigned int reader_count);
-
-int __must_check uds_start_saving_delta_index(const struct delta_index *delta_index,
-                                             unsigned int zone_number,
-                                             struct buffered_writer *buffered_writer);
-
-int __must_check uds_finish_saving_delta_index(const struct delta_index *delta_index,
-                                              unsigned int zone_number);
-
-int __must_check uds_write_guard_delta_list(struct buffered_writer *buffered_writer);
-
-size_t __must_check uds_compute_delta_index_save_bytes(u32 list_count,
-                                                      size_t memory_size);
-
-int __must_check uds_start_delta_index_search(const struct delta_index *delta_index,
-                                             u32 list_number, u32 key,
-                                             struct delta_index_entry *iterator);
-
-int __must_check uds_next_delta_index_entry(struct delta_index_entry *delta_entry);
-
-int __must_check uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry);
-
-int __must_check uds_get_delta_index_entry(const struct delta_index *delta_index,
-                                          u32 list_number, u32 key, const u8 *name,
-                                          struct delta_index_entry *delta_entry);
-
-int __must_check uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry,
-                                              u8 *name);
-
-u32 __must_check uds_get_delta_entry_value(const struct delta_index_entry *delta_entry);
-
-int __must_check uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value);
-
-int __must_check uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key,
-                                          u32 value, const u8 *name);
-
-int __must_check uds_remove_delta_index_entry(struct delta_index_entry *delta_entry);
-
-void uds_get_delta_index_stats(const struct delta_index *delta_index,
-                              struct delta_index_stats *stats);
-
-size_t __must_check uds_compute_delta_index_size(u32 entry_count, u32 mean_delta,
-                                                u32 payload_bits);
-
-u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
-                                  u32 payload_bits, size_t bytes_per_page);
-
-void uds_log_delta_index_entry(struct delta_index_entry *delta_entry);
-
-#endif /* UDS_DELTA_INDEX_H */
diff --git a/drivers/md/dm-vdo/funnel-requestqueue.c b/drivers/md/dm-vdo/funnel-requestqueue.c
deleted file mode 100644 (file)
index d2b49e3..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "funnel-requestqueue.h"
-
-#include <linux/atomic.h>
-#include <linux/compiler.h>
-#include <linux/wait.h>
-
-#include "funnel-queue.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "thread-utils.h"
-
-/*
- * This queue will attempt to handle requests in reasonably sized batches instead of reacting
- * immediately to each new request. The wait time between batches is dynamically adjusted up or
- * down to try to balance responsiveness against wasted thread run time.
- *
- * If the wait time becomes long enough, the queue will become dormant and must be explicitly
- * awoken when a new request is enqueued. The enqueue operation updates "newest" in the funnel
- * queue via xchg (which is a memory barrier), and later checks "dormant" to decide whether to do a
- * wakeup of the worker thread.
- *
- * When deciding to go to sleep, the worker thread sets "dormant" and then examines "newest" to
- * decide if the funnel queue is idle. In dormant mode, the last examination of "newest" before
- * going to sleep is done inside the wait_event_interruptible() macro, after a point where one or
- * more memory barriers have been issued. (Preparing to sleep uses spin locks.) Even if the funnel
- * queue's "next" field update isn't visible yet to make the entry accessible, its existence will
- * kick the worker thread out of dormant mode and back into timer-based mode.
- *
- * Unbatched requests are used to communicate between different zone threads and will also cause
- * the queue to awaken immediately.
- */
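
A minimal usage sketch of this queue, assuming a hypothetical my_processor() callback and a
uds_request that the caller has already filled in; the processor runs on the queue's single worker
thread, and uds_request_queue_finish() drains anything still queued before stopping it.

    static void my_processor(struct uds_request *request)
    {
            /* Handle one request on the worker thread. */
    }

    /* ... elsewhere, in the owning component ... */

    struct uds_request_queue *queue;
    int result;

    result = uds_make_request_queue("example queue", my_processor, &queue);
    if (result != UDS_SUCCESS)
            return result;

    uds_request_queue_enqueue(queue, request);

    /* Later, at shutdown: */
    uds_request_queue_finish(queue);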
-
-enum {
-       NANOSECOND = 1,
-       MICROSECOND = 1000 * NANOSECOND,
-       MILLISECOND = 1000 * MICROSECOND,
-       DEFAULT_WAIT_TIME = 20 * MICROSECOND,
-       MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2,
-       MAXIMUM_WAIT_TIME = MILLISECOND,
-       MINIMUM_BATCH = 32,
-       MAXIMUM_BATCH = 64,
-};
-
-struct uds_request_queue {
-       /* Wait queue for synchronizing producers and consumer */
-       struct wait_queue_head wait_head;
-       /* Function to process a request */
-       uds_request_queue_processor_fn processor;
-       /* Queue of new incoming requests */
-       struct funnel_queue *main_queue;
-       /* Queue of old requests to retry */
-       struct funnel_queue *retry_queue;
-       /* The thread id of the worker thread */
-       struct thread *thread;
-       /* True if the worker was started */
-       bool started;
-       /* When true, requests can be enqueued */
-       bool running;
-       /* A flag set when the worker is waiting without a timeout */
-       atomic_t dormant;
-};
-
-static inline struct uds_request *poll_queues(struct uds_request_queue *queue)
-{
-       struct funnel_queue_entry *entry;
-
-       entry = uds_funnel_queue_poll(queue->retry_queue);
-       if (entry != NULL)
-               return container_of(entry, struct uds_request, queue_link);
-
-       entry = uds_funnel_queue_poll(queue->main_queue);
-       if (entry != NULL)
-               return container_of(entry, struct uds_request, queue_link);
-
-       return NULL;
-}
-
-static inline bool are_queues_idle(struct uds_request_queue *queue)
-{
-       return uds_is_funnel_queue_idle(queue->retry_queue) &&
-              uds_is_funnel_queue_idle(queue->main_queue);
-}
-
-/*
- * Determine if there is a next request to process, and return it if there is. Also return flags
- * indicating whether the worker thread can sleep (for the use of wait_event() macros) and whether
- * the thread did sleep before returning a new request.
- */
-static inline bool dequeue_request(struct uds_request_queue *queue,
-                                  struct uds_request **request_ptr, bool *waited_ptr)
-{
-       struct uds_request *request = poll_queues(queue);
-
-       if (request != NULL) {
-               *request_ptr = request;
-               return true;
-       }
-
-       if (!READ_ONCE(queue->running)) {
-               /* Wake the worker thread so it can exit. */
-               *request_ptr = NULL;
-               return true;
-       }
-
-       *request_ptr = NULL;
-       *waited_ptr = true;
-       return false;
-}
-
-static void wait_for_request(struct uds_request_queue *queue, bool dormant,
-                            unsigned long timeout, struct uds_request **request,
-                            bool *waited)
-{
-       if (dormant) {
-               wait_event_interruptible(queue->wait_head,
-                                        (dequeue_request(queue, request, waited) ||
-                                         !are_queues_idle(queue)));
-               return;
-       }
-
-       wait_event_interruptible_hrtimeout(queue->wait_head,
-                                          dequeue_request(queue, request, waited),
-                                          ns_to_ktime(timeout));
-}
-
-static void request_queue_worker(void *arg)
-{
-       struct uds_request_queue *queue = arg;
-       struct uds_request *request = NULL;
-       unsigned long time_batch = DEFAULT_WAIT_TIME;
-       bool dormant = atomic_read(&queue->dormant);
-       bool waited = false;
-       long current_batch = 0;
-
-       for (;;) {
-               wait_for_request(queue, dormant, time_batch, &request, &waited);
-               if (likely(request != NULL)) {
-                       current_batch++;
-                       queue->processor(request);
-               } else if (!READ_ONCE(queue->running)) {
-                       break;
-               }
-
-               if (dormant) {
-                       /*
-                        * The queue has been roused from dormancy. Clear the flag so enqueuers can
-                        * stop broadcasting. No fence is needed for this transition.
-                        */
-                       atomic_set(&queue->dormant, false);
-                       dormant = false;
-                       time_batch = DEFAULT_WAIT_TIME;
-               } else if (waited) {
-                       /*
-                        * We waited for this request to show up. Adjust the wait time to smooth
-                        * out the batch size.
-                        */
-                       if (current_batch < MINIMUM_BATCH) {
-                               /*
-                                * If the last batch of requests was too small, increase the wait
-                                * time.
-                                */
-                               time_batch += time_batch / 4;
-                               if (time_batch >= MAXIMUM_WAIT_TIME) {
-                                       atomic_set(&queue->dormant, true);
-                                       dormant = true;
-                               }
-                       } else if (current_batch > MAXIMUM_BATCH) {
-                               /*
-                                * If the last batch of requests was too large, decrease the wait
-                                * time.
-                                */
-                               time_batch -= time_batch / 4;
-                               if (time_batch < MINIMUM_WAIT_TIME)
-                                       time_batch = MINIMUM_WAIT_TIME;
-                       }
-                       current_batch = 0;
-               }
-       }
-
-       /*
-        * Ensure that we process any remaining requests that were enqueued before trying to shut
-        * down. The corresponding write barrier is in uds_request_queue_finish().
-        */
-       smp_rmb();
-       while ((request = poll_queues(queue)) != NULL)
-               queue->processor(request);
-}
-
-int uds_make_request_queue(const char *queue_name,
-                          uds_request_queue_processor_fn processor,
-                          struct uds_request_queue **queue_ptr)
-{
-       int result;
-       struct uds_request_queue *queue;
-
-       result = uds_allocate(1, struct uds_request_queue, __func__, &queue);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       queue->processor = processor;
-       queue->running = true;
-       atomic_set(&queue->dormant, false);
-       init_waitqueue_head(&queue->wait_head);
-
-       result = uds_make_funnel_queue(&queue->main_queue);
-       if (result != UDS_SUCCESS) {
-               uds_request_queue_finish(queue);
-               return result;
-       }
-
-       result = uds_make_funnel_queue(&queue->retry_queue);
-       if (result != UDS_SUCCESS) {
-               uds_request_queue_finish(queue);
-               return result;
-       }
-
-       result = vdo_create_thread(request_queue_worker, queue, queue_name,
-                                  &queue->thread);
-       if (result != UDS_SUCCESS) {
-               uds_request_queue_finish(queue);
-               return result;
-       }
-
-       queue->started = true;
-       *queue_ptr = queue;
-       return UDS_SUCCESS;
-}
-
-static inline void wake_up_worker(struct uds_request_queue *queue)
-{
-       if (wq_has_sleeper(&queue->wait_head))
-               wake_up(&queue->wait_head);
-}
-
-void uds_request_queue_enqueue(struct uds_request_queue *queue,
-                              struct uds_request *request)
-{
-       struct funnel_queue *sub_queue;
-       bool unbatched = request->unbatched;
-
-       sub_queue = request->requeued ? queue->retry_queue : queue->main_queue;
-       uds_funnel_queue_put(sub_queue, &request->queue_link);
-
-       /*
-        * We must wake the worker thread when it is dormant. A read fence isn't needed here since
-        * we know the queue operation acts as one.
-        */
-       if (atomic_read(&queue->dormant) || unbatched)
-               wake_up_worker(queue);
-}
-
-void uds_request_queue_finish(struct uds_request_queue *queue)
-{
-       if (queue == NULL)
-               return;
-
-       /*
-        * This memory barrier ensures that any requests we queued will be seen. The point is that
-        * when dequeue_request() sees the following update to the running flag, it will also be
-        * able to see any change we made to a next field in the funnel queue entry. The
-        * corresponding read barrier is in request_queue_worker().
-        */
-       smp_wmb();
-       WRITE_ONCE(queue->running, false);
-
-       if (queue->started) {
-               wake_up_worker(queue);
-               vdo_join_threads(queue->thread);
-       }
-
-       uds_free_funnel_queue(queue->main_queue);
-       uds_free_funnel_queue(queue->retry_queue);
-       uds_free(queue);
-}
diff --git a/drivers/md/dm-vdo/funnel-requestqueue.h b/drivers/md/dm-vdo/funnel-requestqueue.h
deleted file mode 100644 (file)
index 9b0f539..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_REQUEST_QUEUE_H
-#define UDS_REQUEST_QUEUE_H
-
-#include "indexer.h"
-
-/*
- * A simple request queue which will handle new requests in the order in which they are received,
- * and will attempt to handle requeued requests before new ones. However, the nature of the
- * implementation means that it cannot guarantee this ordering; the prioritization is merely a
- * hint.
- */
-
-struct uds_request_queue;
-
-typedef void (*uds_request_queue_processor_fn)(struct uds_request *);
-
-int __must_check uds_make_request_queue(const char *queue_name,
-                                       uds_request_queue_processor_fn processor,
-                                       struct uds_request_queue **queue_ptr);
-
-void uds_request_queue_enqueue(struct uds_request_queue *queue,
-                              struct uds_request *request);
-
-void uds_request_queue_finish(struct uds_request_queue *queue);
-
-#endif /* UDS_REQUEST_QUEUE_H */
diff --git a/drivers/md/dm-vdo/geometry.c b/drivers/md/dm-vdo/geometry.c
deleted file mode 100644 (file)
index 04c0719..0000000
+++ /dev/null
@@ -1,200 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "geometry.h"
-
-#include <linux/compiler.h>
-#include <linux/log2.h>
-
-#include "delta-index.h"
-#include "errors.h"
-#include "indexer.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "permassert.h"
-
-/*
- * An index volume is divided into a fixed number of fixed-size chapters, each consisting of a
- * fixed number of fixed-size pages. The volume layout is defined by two constants and four
- * parameters. The constants are that index records are 32 bytes long (16-byte block name plus
- * 16-byte metadata) and that open chapter index hash slots are one byte long. The four parameters
- * are the number of bytes in a page, the number of record pages in a chapter, the number of
- * chapters in a volume, and the number of chapters that are sparse. From these parameters, we can
- * derive the rest of the layout and other index properties.
- *
- * The index volume is sized by its maximum memory footprint. For a dense index, the persistent
- * storage is about 10 times the size of the memory footprint. For a sparse index, the persistent
- * storage is about 100 times the size of the memory footprint.
- *
- * For a small index with a memory footprint less than 1GB, there are three possible memory
- * configurations: 0.25GB, 0.5GB and 0.75GB. The default geometry for each is 1024 index records
- * per 32 KB page, 1024 chapters per volume, and either 64, 128, or 192 record pages per chapter
- * (resulting in 6, 13, or 20 index pages per chapter) depending on the memory configuration. For
- * the VDO default of a 0.25 GB index, this yields a deduplication window of 256 GB using about 2.5
- * GB for the persistent storage and 256 MB of RAM.
- *
- * For a larger index with a memory footprint that is a multiple of 1 GB, the geometry is 1024
- * index records per 32 KB page, 256 record pages per chapter, 26 index pages per chapter, and 1024
- * chapters for every GB of memory footprint. For a 1 GB volume, this yields a deduplication window
- * of 1 TB using about 9 GB of persistent storage and 1 GB of RAM.
- *
- * The above numbers hold for volumes which have no sparse chapters. A sparse volume has 10 times
- * as many chapters as the corresponding non-sparse volume, which provides 10 times the
- * deduplication window while using 10 times as much persistent storage as the equivalent
- * non-sparse volume with the same memory footprint.
- *
- * If the volume has been converted from a non-lvm format to an lvm volume, the number of chapters
- * per volume will have been reduced by one by eliminating physical chapter 0, and the virtual
- * chapter that formerly mapped to physical chapter 0 may be remapped to another physical chapter.
- * This remapping is expressed by storing which virtual chapter was remapped, and which physical
- * chapter it was moved to.
- */
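
As a concrete illustration (not taken from this file), the default 0.25 GB configuration described
above could be built with the constants from geometry.h: 1024 records per 32 KB page, 64 record
pages per chapter, and 1024 chapters give 64 Mi records per volume, which at 4 KB per data block
is the 256 GB deduplication window. A minimal sketch, with error handling reduced to a bare
return:

    struct index_geometry *geometry;
    int result;

    /* 32 KB pages, 64 record pages per chapter, 1024 chapters, no sparse chapters. */
    result = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE,
                                     SMALL_RECORD_PAGES_PER_CHAPTER,
                                     DEFAULT_CHAPTERS_PER_VOLUME,
                                     DEFAULT_SPARSE_CHAPTERS_PER_VOLUME,
                                     0, 0, &geometry);
    if (result != UDS_SUCCESS)
            return result;

    /* 1024 records/page * 64 pages/chapter * 1024 chapters = 64 Mi records. */

    uds_free_index_geometry(geometry);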
-
-int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter,
-                           u32 chapters_per_volume, u32 sparse_chapters_per_volume,
-                           u64 remapped_virtual, u64 remapped_physical,
-                           struct index_geometry **geometry_ptr)
-{
-       int result;
-       struct index_geometry *geometry;
-
-       result = uds_allocate(1, struct index_geometry, "geometry", &geometry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       geometry->bytes_per_page = bytes_per_page;
-       geometry->record_pages_per_chapter = record_pages_per_chapter;
-       geometry->chapters_per_volume = chapters_per_volume;
-       geometry->sparse_chapters_per_volume = sparse_chapters_per_volume;
-       geometry->dense_chapters_per_volume = chapters_per_volume - sparse_chapters_per_volume;
-       geometry->remapped_virtual = remapped_virtual;
-       geometry->remapped_physical = remapped_physical;
-
-       geometry->records_per_page = bytes_per_page / BYTES_PER_RECORD;
-       geometry->records_per_chapter = geometry->records_per_page * record_pages_per_chapter;
-       geometry->records_per_volume = (u64) geometry->records_per_chapter * chapters_per_volume;
-
-       geometry->chapter_mean_delta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS;
-       geometry->chapter_payload_bits = bits_per(record_pages_per_chapter - 1);
-       /*
-        * We want 1 delta list for every 64 records in the chapter.
-        * The "| 077" ensures that the chapter_delta_list_bits computation
-        * does not underflow.
-        */
-       geometry->chapter_delta_list_bits =
-               bits_per((geometry->records_per_chapter - 1) | 077) - 6;
-       geometry->delta_lists_per_chapter = 1 << geometry->chapter_delta_list_bits;
-       /* We need enough address bits to achieve the desired mean delta. */
-       geometry->chapter_address_bits =
-               (DEFAULT_CHAPTER_MEAN_DELTA_BITS -
-                geometry->chapter_delta_list_bits +
-                bits_per(geometry->records_per_chapter - 1));
-       geometry->index_pages_per_chapter =
-               uds_get_delta_index_page_count(geometry->records_per_chapter,
-                                              geometry->delta_lists_per_chapter,
-                                              geometry->chapter_mean_delta,
-                                              geometry->chapter_payload_bits,
-                                              bytes_per_page);
-
-       geometry->pages_per_chapter = geometry->index_pages_per_chapter + record_pages_per_chapter;
-       geometry->pages_per_volume = geometry->pages_per_chapter * chapters_per_volume;
-       geometry->bytes_per_volume =
-               bytes_per_page * (geometry->pages_per_volume + HEADER_PAGES_PER_VOLUME);
-
-       *geometry_ptr = geometry;
-       return UDS_SUCCESS;
-}
-
-int uds_copy_index_geometry(struct index_geometry *source,
-                           struct index_geometry **geometry_ptr)
-{
-       return uds_make_index_geometry(source->bytes_per_page,
-                                      source->record_pages_per_chapter,
-                                      source->chapters_per_volume,
-                                      source->sparse_chapters_per_volume,
-                                      source->remapped_virtual, source->remapped_physical,
-                                      geometry_ptr);
-}
-
-void uds_free_index_geometry(struct index_geometry *geometry)
-{
-       uds_free(geometry);
-}
-
-u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry,
-                                            u64 virtual_chapter)
-{
-       u64 delta;
-
-       if (!uds_is_reduced_index_geometry(geometry))
-               return virtual_chapter % geometry->chapters_per_volume;
-
-       if (likely(virtual_chapter > geometry->remapped_virtual)) {
-               delta = virtual_chapter - geometry->remapped_virtual;
-               if (likely(delta > geometry->remapped_physical))
-                       return delta % geometry->chapters_per_volume;
-               else
-                       return delta - 1;
-       }
-
-       if (virtual_chapter == geometry->remapped_virtual)
-               return geometry->remapped_physical;
-
-       delta = geometry->remapped_virtual - virtual_chapter;
-       if (delta < geometry->chapters_per_volume)
-               return geometry->chapters_per_volume - delta;
-
-       /* This chapter is so old the answer doesn't matter. */
-       return 0;
-}
-
-/* Check whether any sparse chapters are in use. */
-bool uds_has_sparse_chapters(const struct index_geometry *geometry,
-                            u64 oldest_virtual_chapter, u64 newest_virtual_chapter)
-{
-       return uds_is_sparse_index_geometry(geometry) &&
-               ((newest_virtual_chapter - oldest_virtual_chapter + 1) >
-                geometry->dense_chapters_per_volume);
-}
-
-bool uds_is_chapter_sparse(const struct index_geometry *geometry,
-                          u64 oldest_virtual_chapter, u64 newest_virtual_chapter,
-                          u64 virtual_chapter_number)
-{
-       return uds_has_sparse_chapters(geometry, oldest_virtual_chapter,
-                                      newest_virtual_chapter) &&
-               ((virtual_chapter_number + geometry->dense_chapters_per_volume) <=
-                newest_virtual_chapter);
-}
-
-/* Calculate how many chapters to expire after opening the newest chapter. */
-u32 uds_chapters_to_expire(const struct index_geometry *geometry, u64 newest_chapter)
-{
-       /* If the index isn't full yet, don't expire anything. */
-       if (newest_chapter < geometry->chapters_per_volume)
-               return 0;
-
-       /* If a chapter is out of order... */
-       if (geometry->remapped_physical > 0) {
-               u64 oldest_chapter = newest_chapter - geometry->chapters_per_volume;
-
-               /*
-                * ... expire an extra chapter when expiring the moved chapter to free physical
-                * space for the new chapter ...
-                */
-               if (oldest_chapter == geometry->remapped_virtual)
-                       return 2;
-
-               /*
-                * ... but don't expire anything when the new chapter will use the physical chapter
-                * freed by expiring the moved chapter.
-                */
-               if (oldest_chapter == (geometry->remapped_virtual + geometry->remapped_physical))
-                       return 0;
-       }
-
-       /* Normally, just expire one. */
-       return 1;
-}
diff --git a/drivers/md/dm-vdo/geometry.h b/drivers/md/dm-vdo/geometry.h
deleted file mode 100644 (file)
index a2ecdb2..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_GEOMETRY_H
-#define UDS_INDEX_GEOMETRY_H
-
-#include "indexer.h"
-
-/*
- * The index_geometry records parameters that define the layout of a UDS index volume, and the
- * size and shape of various index structures. It is created when the index is created, and is
- * referenced by many index sub-components.
- */
-
-struct index_geometry {
-       /* Size of a chapter page, in bytes */
-       size_t bytes_per_page;
-       /* Number of record pages in a chapter */
-       u32 record_pages_per_chapter;
-       /* Total number of chapters in a volume */
-       u32 chapters_per_volume;
-       /* Number of sparsely-indexed chapters in a volume */
-       u32 sparse_chapters_per_volume;
-       /* Number of bits used to determine delta list numbers */
-       u8 chapter_delta_list_bits;
-       /* Virtual chapter remapped from physical chapter 0 */
-       u64 remapped_virtual;
-       /* New physical chapter where the remapped chapter can be found */
-       u64 remapped_physical;
-
-       /*
-        * The following properties are derived from the ones above, but they are computed and
-        * recorded as fields for convenience.
-        */
-       /* Total number of pages in a volume, excluding the header */
-       u32 pages_per_volume;
-       /* Total number of bytes in a volume, including the header */
-       size_t bytes_per_volume;
-       /* Number of pages in a chapter */
-       u32 pages_per_chapter;
-       /* Number of index pages in a chapter index */
-       u32 index_pages_per_chapter;
-       /* Number of records that fit on a page */
-       u32 records_per_page;
-       /* Number of records that fit in a chapter */
-       u32 records_per_chapter;
-       /* Number of records that fit in a volume */
-       u64 records_per_volume;
-       /* Number of delta lists per chapter index */
-       u32 delta_lists_per_chapter;
-       /* Mean delta for chapter indexes */
-       u32 chapter_mean_delta;
-       /* Number of bits needed for record page numbers */
-       u8 chapter_payload_bits;
-       /* Number of bits used to compute addresses for chapter delta lists */
-       u8 chapter_address_bits;
-       /* Number of densely-indexed chapters in a volume */
-       u32 dense_chapters_per_volume;
-};
-
-enum {
-       /* The number of bytes in a record (name + metadata) */
-       BYTES_PER_RECORD = (UDS_RECORD_NAME_SIZE + UDS_RECORD_DATA_SIZE),
-
-       /* The default length of a page in a chapter, in bytes */
-       DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD,
-
-       /* The default maximum number of records per page */
-       DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD,
-
-       /* The default number of record pages in a chapter */
-       DEFAULT_RECORD_PAGES_PER_CHAPTER = 256,
-
-       /* The default number of record pages in a chapter for a small index */
-       SMALL_RECORD_PAGES_PER_CHAPTER = 64,
-
-       /* The default number of chapters in a volume */
-       DEFAULT_CHAPTERS_PER_VOLUME = 1024,
-
-       /* The default number of sparsely-indexed chapters in a volume */
-       DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0,
-
-       /* The log2 of the default mean delta */
-       DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16,
-
-       /* The log2 of the number of delta lists in a large chapter */
-       DEFAULT_CHAPTER_DELTA_LIST_BITS = 12,
-
-       /* The log2 of the number of delta lists in a small chapter */
-       SMALL_CHAPTER_DELTA_LIST_BITS = 10,
-
-       /* The number of header pages per volume */
-       HEADER_PAGES_PER_VOLUME = 1,
-};
-
-int __must_check uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter,
-                                        u32 chapters_per_volume,
-                                        u32 sparse_chapters_per_volume, u64 remapped_virtual,
-                                        u64 remapped_physical,
-                                        struct index_geometry **geometry_ptr);
-
-int __must_check uds_copy_index_geometry(struct index_geometry *source,
-                                        struct index_geometry **geometry_ptr);
-
-void uds_free_index_geometry(struct index_geometry *geometry);
-
-u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry,
-                                            u64 virtual_chapter);
-
-/*
- * Check whether this geometry is reduced by a chapter. This will only be true if the volume was
- * converted from a non-lvm volume to an lvm volume.
- */
-static inline bool __must_check
-uds_is_reduced_index_geometry(const struct index_geometry *geometry)
-{
-       return !!(geometry->chapters_per_volume & 1);
-}
-
-static inline bool __must_check
-uds_is_sparse_index_geometry(const struct index_geometry *geometry)
-{
-       return geometry->sparse_chapters_per_volume > 0;
-}
-
-bool __must_check uds_has_sparse_chapters(const struct index_geometry *geometry,
-                                         u64 oldest_virtual_chapter,
-                                         u64 newest_virtual_chapter);
-
-bool __must_check uds_is_chapter_sparse(const struct index_geometry *geometry,
-                                       u64 oldest_virtual_chapter,
-                                       u64 newest_virtual_chapter,
-                                       u64 virtual_chapter_number);
-
-u32 __must_check uds_chapters_to_expire(const struct index_geometry *geometry,
-                                       u64 newest_chapter);
-
-#endif /* UDS_INDEX_GEOMETRY_H */
diff --git a/drivers/md/dm-vdo/hash-utils.h b/drivers/md/dm-vdo/hash-utils.h
deleted file mode 100644 (file)
index e3b865b..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_HASH_UTILS_H
-#define UDS_HASH_UTILS_H
-
-#include "geometry.h"
-#include "indexer.h"
-#include "numeric.h"
-
-/* Utilities for extracting portions of a request name for various uses. */
-
-/* How various portions of a record name are apportioned. */
-enum {
-       VOLUME_INDEX_BYTES_OFFSET = 0,
-       VOLUME_INDEX_BYTES_COUNT = 8,
-       CHAPTER_INDEX_BYTES_OFFSET = 8,
-       CHAPTER_INDEX_BYTES_COUNT = 6,
-       SAMPLE_BYTES_OFFSET = 14,
-       SAMPLE_BYTES_COUNT = 2,
-};
-
-static inline u64 uds_extract_chapter_index_bytes(const struct uds_record_name *name)
-{
-       const u8 *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET];
-       u64 bytes = (u64) get_unaligned_be16(chapter_bits) << 32;
-
-       bytes |= get_unaligned_be32(chapter_bits + 2);
-       return bytes;
-}
-
-static inline u64 uds_extract_volume_index_bytes(const struct uds_record_name *name)
-{
-       return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]);
-}
-
-static inline u32 uds_extract_sampling_bytes(const struct uds_record_name *name)
-{
-       return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]);
-}
-
-/* Compute the chapter delta list for a given name. */
-static inline u32 uds_hash_to_chapter_delta_list(const struct uds_record_name *name,
-                                                const struct index_geometry *geometry)
-{
-       return ((uds_extract_chapter_index_bytes(name) >> geometry->chapter_address_bits) &
-               ((1 << geometry->chapter_delta_list_bits) - 1));
-}
-
-/* Compute the chapter delta address for a given name. */
-static inline u32 uds_hash_to_chapter_delta_address(const struct uds_record_name *name,
-                                                   const struct index_geometry *geometry)
-{
-       return uds_extract_chapter_index_bytes(name) & ((1 << geometry->chapter_address_bits) - 1);
-}
-
-static inline unsigned int uds_name_to_hash_slot(const struct uds_record_name *name,
-                                                unsigned int slot_count)
-{
-       return (unsigned int) (uds_extract_chapter_index_bytes(name) % slot_count);
-}
-
-#endif /* UDS_HASH_UTILS_H */
diff --git a/drivers/md/dm-vdo/index-layout.c b/drivers/md/dm-vdo/index-layout.c
deleted file mode 100644 (file)
index 2da507b..0000000
+++ /dev/null
@@ -1,1768 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "index-layout.h"
-
-#include <linux/random.h>
-
-#include "config.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "murmurhash3.h"
-#include "numeric.h"
-#include "open-chapter.h"
-#include "time-utils.h"
-#include "volume-index.h"
-
-/*
- * The UDS layout on storage media is divided into a number of fixed-size regions, the sizes of
- * which are computed when the index is created. Every header and region begins on a 4K block
- * boundary. Save regions are further sub-divided into regions of their own.
- *
- * Each region has a kind and an instance number. Some kinds only have one instance and therefore
- * use RL_SOLE_INSTANCE (-1) as the instance number. The RL_KIND_INDEX used to use instances to
- * represent sub-indices; now, however, there is only ever one sub-index and therefore one instance.
- * The RL_KIND_VOLUME_INDEX uses instances to record which zone is being saved.
- *
- * Every region header has a type and version.
- *
- *     +-+-+---------+--------+--------+-+
- *     | | |   I N D E X  0   101, 0   | |
- *     |H|C+---------+--------+--------+S|
- *     |D|f| Volume  | Save   | Save   |e|
- *     |R|g| Region  | Region | Region |a|
- *     | | | 201, -1 | 202, 0 | 202, 1 |l|
- *     +-+-+---------+--------+--------+-+
- *
- * The header contains the encoded region layout table as well as some index configuration data.
- * The sub-index region and its subdivisions are maintained in the same table.
- *
- * There are two save regions to preserve the old state in case saving the new state is incomplete.
- * They are used in alternation. Each save region is further divided into sub-regions.
- *
- *     +-+-----+------+------+-----+-----+
- *     |H| IPM | MI   | MI   |     | OC  |
- *     |D|     | zone | zone | ... |     |
- *     |R| 301 | 302  | 302  |     | 303 |
- *     | | -1  |  0   |  1   |     | -1  |
- *     +-+-----+------+------+-----+-----+
- *
- * The header contains the encoded region layout table as well as index state data for that save.
- * Each save also has a unique nonce.
- */
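
Mapping the diagrams to the region kinds and instances defined below, the top-level table would
contain entries roughly like the following. This is an illustrative sketch only; block extents are
omitted and the real table is built and encoded by the code later in this file.

    static const struct layout_region example_layout_table[] = {
            { .kind = RL_KIND_HEADER, .instance = RL_SOLE_INSTANCE },
            { .kind = RL_KIND_CONFIG, .instance = RL_SOLE_INSTANCE },
            { .kind = RL_KIND_INDEX,  .instance = 0 },
            { .kind = RL_KIND_VOLUME, .instance = RL_SOLE_INSTANCE },
            { .kind = RL_KIND_SAVE,   .instance = 0 },
            { .kind = RL_KIND_SAVE,   .instance = 1 },
            { .kind = RL_KIND_SEAL,   .instance = RL_SOLE_INSTANCE },
    };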
-
-enum {
-       MAGIC_SIZE = 32,
-       NONCE_INFO_SIZE = 32,
-       MAX_SAVES = 2,
-};
-
-enum region_kind {
-       RL_KIND_EMPTY = 0,
-       RL_KIND_HEADER = 1,
-       RL_KIND_CONFIG = 100,
-       RL_KIND_INDEX = 101,
-       RL_KIND_SEAL = 102,
-       RL_KIND_VOLUME = 201,
-       RL_KIND_SAVE = 202,
-       RL_KIND_INDEX_PAGE_MAP = 301,
-       RL_KIND_VOLUME_INDEX = 302,
-       RL_KIND_OPEN_CHAPTER = 303,
-};
-
-/* Some region types are historical and are no longer used. */
-enum region_type {
-       RH_TYPE_FREE = 0, /* unused */
-       RH_TYPE_SUPER = 1,
-       RH_TYPE_SAVE = 2,
-       RH_TYPE_CHECKPOINT = 3, /* unused */
-       RH_TYPE_UNSAVED = 4,
-};
-
-enum {
-       RL_SOLE_INSTANCE = 65535,
-};
-
-/*
- * Super block version 2 is the first released version.
- *
- * Super block version 3 is the normal version used from RHEL 8.2 onwards.
- *
- * Super block versions 4 through 6 were incremental development versions and
- * are not supported.
- *
- * Super block version 7 is used for volumes which have been reduced in size by one chapter in
- * order to make room to prepend LVM metadata to a volume originally created without lvm. This
- * allows the index to retain most of its deduplication records.
- */
-enum {
-       SUPER_VERSION_MINIMUM = 3,
-       SUPER_VERSION_CURRENT = 3,
-       SUPER_VERSION_MAXIMUM = 7,
-};
-
-static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
-static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
-
-struct region_header {
-       u64 magic;
-       u64 region_blocks;
-       u16 type;
-       /* Currently always version 1 */
-       u16 version;
-       u16 region_count;
-       u16 payload;
-};
-
-struct layout_region {
-       u64 start_block;
-       u64 block_count;
-       u32 __unused;
-       u16 kind;
-       u16 instance;
-};
-
-struct region_table {
-       size_t encoded_size;
-       struct region_header header;
-       struct layout_region regions[];
-};
-
-struct index_save_data {
-       u64 timestamp;
-       u64 nonce;
-       /* Currently always version 1 */
-       u32 version;
-       u32 unused__;
-};
-
-struct index_state_version {
-       s32 signature;
-       s32 version_id;
-};
-
-static const struct index_state_version INDEX_STATE_VERSION_301 = {
-       .signature  = -1,
-       .version_id = 301,
-};
-
-struct index_state_data301 {
-       struct index_state_version version;
-       u64 newest_chapter;
-       u64 oldest_chapter;
-       u64 last_save;
-       u32 unused;
-       u32 padding;
-};
-
-struct index_save_layout {
-       unsigned int zone_count;
-       struct layout_region index_save;
-       struct layout_region header;
-       struct layout_region index_page_map;
-       struct layout_region free_space;
-       struct layout_region volume_index_zones[MAX_ZONES];
-       struct layout_region open_chapter;
-       struct index_save_data save_data;
-       struct index_state_data301 state_data;
-};
-
-struct sub_index_layout {
-       u64 nonce;
-       struct layout_region sub_index;
-       struct layout_region volume;
-       struct index_save_layout *saves;
-};
-
-struct super_block_data {
-       u8 magic_label[MAGIC_SIZE];
-       u8 nonce_info[NONCE_INFO_SIZE];
-       u64 nonce;
-       u32 version;
-       u32 block_size;
-       u16 index_count;
-       u16 max_saves;
-       /* Padding reflects a blank field on permanent storage */
-       u8 padding[4];
-       u64 open_chapter_blocks;
-       u64 page_map_blocks;
-       u64 volume_offset;
-       u64 start_offset;
-};
-
-struct index_layout {
-       struct io_factory *factory;
-       size_t factory_size;
-       off_t offset;
-       struct super_block_data super;
-       struct layout_region header;
-       struct layout_region config;
-       struct sub_index_layout index;
-       struct layout_region seal;
-       u64 total_blocks;
-};
-
-struct save_layout_sizes {
-       unsigned int save_count;
-       size_t block_size;
-       u64 volume_blocks;
-       u64 volume_index_blocks;
-       u64 page_map_blocks;
-       u64 open_chapter_blocks;
-       u64 save_blocks;
-       u64 sub_index_blocks;
-       u64 total_blocks;
-       size_t total_size;
-};
-
-static inline bool is_converted_super_block(struct super_block_data *super)
-{
-       return super->version == 7;
-}
-
-static int __must_check compute_sizes(const struct uds_configuration *config,
-                                     struct save_layout_sizes *sls)
-{
-       int result;
-       struct index_geometry *geometry = config->geometry;
-
-       memset(sls, 0, sizeof(*sls));
-       sls->save_count = MAX_SAVES;
-       sls->block_size = UDS_BLOCK_SIZE;
-       sls->volume_blocks = geometry->bytes_per_volume / sls->block_size;
-
-       result = uds_compute_volume_index_save_blocks(config, sls->block_size,
-                                                     &sls->volume_index_blocks);
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "cannot compute index save size");
-
-       sls->page_map_blocks =
-               DIV_ROUND_UP(uds_compute_index_page_map_save_size(geometry),
-                            sls->block_size);
-       sls->open_chapter_blocks =
-               DIV_ROUND_UP(uds_compute_saved_open_chapter_size(geometry),
-                            sls->block_size);
-       sls->save_blocks =
-               1 + (sls->volume_index_blocks + sls->page_map_blocks + sls->open_chapter_blocks);
-       sls->sub_index_blocks = sls->volume_blocks + (sls->save_count * sls->save_blocks);
-       sls->total_blocks = 3 + sls->sub_index_blocks;
-       sls->total_size = sls->total_blocks * sls->block_size;
-
-       return UDS_SUCCESS;
-}
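-
-/*
- * A worked example with hypothetical inputs rather than a real configuration:
- * if volume_index_blocks, page_map_blocks, and open_chapter_blocks are each 1,
- * then save_blocks = 1 + (1 + 1 + 1) = 4 (one block for the save header plus
- * the three save sub-regions). With volume_blocks = 8, sub_index_blocks =
- * 8 + (2 * 4) = 16 and total_blocks = 3 + 16 = 19, the extra three blocks
- * being the layout header, the configuration, and the seal. total_size is
- * then 19 * UDS_BLOCK_SIZE.
- */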
-
-int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size)
-{
-       int result;
-       struct uds_configuration *index_config;
-       struct save_layout_sizes sizes;
-
-       if (index_size == NULL) {
-               uds_log_error("Missing output size pointer");
-               return -EINVAL;
-       }
-
-       result = uds_make_configuration(parameters, &index_config);
-       if (result != UDS_SUCCESS) {
-               uds_log_error_strerror(result, "cannot compute index size");
-               return uds_status_to_errno(result);
-       }
-
-       result = compute_sizes(index_config, &sizes);
-       uds_free_configuration(index_config);
-       if (result != UDS_SUCCESS)
-               return uds_status_to_errno(result);
-
-       *index_size = sizes.total_size;
-       return UDS_SUCCESS;
-}
-
-/* Create unique data using the current time and a pseudorandom number. */
-static void create_unique_nonce_data(u8 *buffer)
-{
-       ktime_t now = current_time_ns(CLOCK_REALTIME);
-       u32 rand;
-       size_t offset = 0;
-
-       get_random_bytes(&rand, sizeof(u32));
-       memcpy(buffer + offset, &now, sizeof(now));
-       offset += sizeof(now);
-       memcpy(buffer + offset, &rand, sizeof(rand));
-       offset += sizeof(rand);
-       while (offset < NONCE_INFO_SIZE) {
-               size_t len = min(NONCE_INFO_SIZE - offset, offset);
-
-               memcpy(buffer + offset, buffer, len);
-               offset += len;
-       }
-}
-
-static u64 hash_stuff(u64 start, const void *data, size_t len)
-{
-       u32 seed = start ^ (start >> 27);
-       u8 hash_buffer[16];
-
-       murmurhash3_128(data, len, seed, hash_buffer);
-       return get_unaligned_le64(hash_buffer + 4);
-}
-
-/* Generate a primary nonce from the provided data. */
-static u64 generate_primary_nonce(const void *data, size_t len)
-{
-       return hash_stuff(0xa1b1e0fc, data, len);
-}
-
-/*
- * Deterministically generate a secondary nonce from an existing nonce and some arbitrary data by
- * hashing the original nonce and the data to produce a new nonce.
- */
-static u64 generate_secondary_nonce(u64 nonce, const void *data, size_t len)
-{
-       return hash_stuff(nonce + 1, data, len);
-}
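-
-/*
- * A brief sketch of the nonce chain (summarizing the functions below, not
- * additional on-disk state): the superblock nonce is generate_primary_nonce()
- * over the random nonce_info; the sub-index nonce is generate_secondary_nonce()
- * over the superblock nonce and the sub-index start block; and each save nonce
- * is generate_secondary_nonce() over the sub-index nonce and that save's
- * timestamp, version, and start block. A save is only treated as valid when
- * its stored nonce matches this derivation (see validate_index_save_layout()).
- */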
-
-static int __must_check open_layout_reader(struct index_layout *layout,
-                                          struct layout_region *lr, off_t offset,
-                                          struct buffered_reader **reader_ptr)
-{
-       return uds_make_buffered_reader(layout->factory, lr->start_block + offset,
-                                       lr->block_count, reader_ptr);
-}
-
-static int open_region_reader(struct index_layout *layout, struct layout_region *region,
-                             struct buffered_reader **reader_ptr)
-{
-       return open_layout_reader(layout, region, -layout->super.start_offset,
-                                 reader_ptr);
-}
-
-static int __must_check open_layout_writer(struct index_layout *layout,
-                                          struct layout_region *lr, off_t offset,
-                                          struct buffered_writer **writer_ptr)
-{
-       return uds_make_buffered_writer(layout->factory, lr->start_block + offset,
-                                       lr->block_count, writer_ptr);
-}
-
-static int open_region_writer(struct index_layout *layout, struct layout_region *region,
-                             struct buffered_writer **writer_ptr)
-{
-       return open_layout_writer(layout, region, -layout->super.start_offset,
-                                 writer_ptr);
-}
-
-static void generate_super_block_data(struct save_layout_sizes *sls,
-                                     struct super_block_data *super)
-{
-       memset(super, 0, sizeof(*super));
-       memcpy(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE);
-       create_unique_nonce_data(super->nonce_info);
-
-       super->nonce = generate_primary_nonce(super->nonce_info,
-                                             sizeof(super->nonce_info));
-       super->version = SUPER_VERSION_CURRENT;
-       super->block_size = sls->block_size;
-       super->index_count = 1;
-       super->max_saves = sls->save_count;
-       super->open_chapter_blocks = sls->open_chapter_blocks;
-       super->page_map_blocks = sls->page_map_blocks;
-       super->volume_offset = 0;
-       super->start_offset = 0;
-}
-
-static void define_sub_index_nonce(struct index_layout *layout)
-{
-       struct sub_index_nonce_data {
-               u64 offset;
-               u16 index_id;
-       };
-       struct sub_index_layout *sil = &layout->index;
-       u64 primary_nonce = layout->super.nonce;
-       u8 buffer[sizeof(struct sub_index_nonce_data)] = { 0 };
-       size_t offset = 0;
-
-       encode_u64_le(buffer, &offset, sil->sub_index.start_block);
-       encode_u16_le(buffer, &offset, 0);
-       sil->nonce = generate_secondary_nonce(primary_nonce, buffer, sizeof(buffer));
-       if (sil->nonce == 0) {
-               sil->nonce = generate_secondary_nonce(~primary_nonce + 1, buffer,
-                                                     sizeof(buffer));
-       }
-}
-
-static void setup_sub_index(struct index_layout *layout, u64 start_block,
-                           struct save_layout_sizes *sls)
-{
-       struct sub_index_layout *sil = &layout->index;
-       u64 next_block = start_block;
-       unsigned int i;
-
-       sil->sub_index = (struct layout_region) {
-               .start_block = start_block,
-               .block_count = sls->sub_index_blocks,
-               .kind = RL_KIND_INDEX,
-               .instance = 0,
-       };
-
-       sil->volume = (struct layout_region) {
-               .start_block = next_block,
-               .block_count = sls->volume_blocks,
-               .kind = RL_KIND_VOLUME,
-               .instance = RL_SOLE_INSTANCE,
-       };
-
-       next_block += sls->volume_blocks;
-
-       for (i = 0; i < sls->save_count; i++) {
-               sil->saves[i].index_save = (struct layout_region) {
-                       .start_block = next_block,
-                       .block_count = sls->save_blocks,
-                       .kind = RL_KIND_SAVE,
-                       .instance = i,
-               };
-
-               next_block += sls->save_blocks;
-       }
-
-       define_sub_index_nonce(layout);
-}
-
-static void initialize_layout(struct index_layout *layout, struct save_layout_sizes *sls)
-{
-       u64 next_block = layout->offset / sls->block_size;
-
-       layout->total_blocks = sls->total_blocks;
-       generate_super_block_data(sls, &layout->super);
-       layout->header = (struct layout_region) {
-               .start_block = next_block++,
-               .block_count = 1,
-               .kind = RL_KIND_HEADER,
-               .instance = RL_SOLE_INSTANCE,
-       };
-
-       layout->config = (struct layout_region) {
-               .start_block = next_block++,
-               .block_count = 1,
-               .kind = RL_KIND_CONFIG,
-               .instance = RL_SOLE_INSTANCE,
-       };
-
-       setup_sub_index(layout, next_block, sls);
-       next_block += sls->sub_index_blocks;
-
-       layout->seal = (struct layout_region) {
-               .start_block = next_block,
-               .block_count = 1,
-               .kind = RL_KIND_SEAL,
-               .instance = RL_SOLE_INSTANCE,
-       };
-}
-
-static int __must_check make_index_save_region_table(struct index_save_layout *isl,
-                                                    struct region_table **table_ptr)
-{
-       int result;
-       unsigned int z;
-       struct region_table *table;
-       struct layout_region *lr;
-       u16 region_count;
-       size_t payload;
-       size_t type;
-
-       if (isl->zone_count > 0) {
-               /*
-                * Normal save regions: header, page map, volume index zones,
-                * open chapter, and possibly free space.
-                */
-               region_count = 3 + isl->zone_count;
-               if (isl->free_space.block_count > 0)
-                       region_count++;
-
-               payload = sizeof(isl->save_data) + sizeof(isl->state_data);
-               type = RH_TYPE_SAVE;
-       } else {
-               /* Empty save regions: header, page map, free space. */
-               region_count = 3;
-               payload = sizeof(isl->save_data);
-               type = RH_TYPE_UNSAVED;
-       }
-
-       result = uds_allocate_extended(struct region_table, region_count,
-                                      struct layout_region,
-                                      "layout region table for ISL", &table);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       lr = &table->regions[0];
-       *lr++ = isl->header;
-       *lr++ = isl->index_page_map;
-       for (z = 0; z < isl->zone_count; z++)
-               *lr++ = isl->volume_index_zones[z];
-
-       if (isl->zone_count > 0)
-               *lr++ = isl->open_chapter;
-
-       if (isl->free_space.block_count > 0)
-               *lr++ = isl->free_space;
-
-       table->header = (struct region_header) {
-               .magic = REGION_MAGIC,
-               .region_blocks = isl->index_save.block_count,
-               .type = type,
-               .version = 1,
-               .region_count = region_count,
-               .payload = payload,
-       };
-
-       table->encoded_size = (sizeof(struct region_header) + payload +
-                              region_count * sizeof(struct layout_region));
-       *table_ptr = table;
-       return UDS_SUCCESS;
-}
-
-static void encode_region_table(u8 *buffer, size_t *offset, struct region_table *table)
-{
-       unsigned int i;
-
-       encode_u64_le(buffer, offset, REGION_MAGIC);
-       encode_u64_le(buffer, offset, table->header.region_blocks);
-       encode_u16_le(buffer, offset, table->header.type);
-       encode_u16_le(buffer, offset, table->header.version);
-       encode_u16_le(buffer, offset, table->header.region_count);
-       encode_u16_le(buffer, offset, table->header.payload);
-
-       for (i = 0; i < table->header.region_count; i++) {
-               encode_u64_le(buffer, offset, table->regions[i].start_block);
-               encode_u64_le(buffer, offset, table->regions[i].block_count);
-               encode_u32_le(buffer, offset, 0);
-               encode_u16_le(buffer, offset, table->regions[i].kind);
-               encode_u16_le(buffer, offset, table->regions[i].instance);
-       }
-}
-
-static int __must_check write_index_save_header(struct index_save_layout *isl,
-                                               struct region_table *table,
-                                               struct buffered_writer *writer)
-{
-       int result;
-       u8 *buffer;
-       size_t offset = 0;
-
-       result = uds_allocate(table->encoded_size, u8, "index save data", &buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       encode_region_table(buffer, &offset, table);
-       encode_u64_le(buffer, &offset, isl->save_data.timestamp);
-       encode_u64_le(buffer, &offset, isl->save_data.nonce);
-       encode_u32_le(buffer, &offset, isl->save_data.version);
-       encode_u32_le(buffer, &offset, 0);
-       if (isl->zone_count > 0) {
-               encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.signature);
-               encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.version_id);
-               encode_u64_le(buffer, &offset, isl->state_data.newest_chapter);
-               encode_u64_le(buffer, &offset, isl->state_data.oldest_chapter);
-               encode_u64_le(buffer, &offset, isl->state_data.last_save);
-               encode_u64_le(buffer, &offset, 0);
-       }
-
-       result = uds_write_to_buffered_writer(writer, buffer, offset);
-       uds_free(buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return uds_flush_buffered_writer(writer);
-}
-
-static int write_index_save_layout(struct index_layout *layout,
-                                  struct index_save_layout *isl)
-{
-       int result;
-       struct region_table *table;
-       struct buffered_writer *writer;
-
-       result = make_index_save_region_table(isl, &table);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = open_region_writer(layout, &isl->header, &writer);
-       if (result != UDS_SUCCESS) {
-               uds_free(table);
-               return result;
-       }
-
-       result = write_index_save_header(isl, table, writer);
-       uds_free(table);
-       uds_free_buffered_writer(writer);
-
-       return result;
-}
-
-static void reset_index_save_layout(struct index_save_layout *isl, u64 page_map_blocks)
-{
-       u64 free_blocks;
-       u64 next_block = isl->index_save.start_block;
-
-       isl->zone_count = 0;
-       memset(&isl->save_data, 0, sizeof(isl->save_data));
-
-       isl->header = (struct layout_region) {
-               .start_block = next_block++,
-               .block_count = 1,
-               .kind = RL_KIND_HEADER,
-               .instance = RL_SOLE_INSTANCE,
-       };
-
-       isl->index_page_map = (struct layout_region) {
-               .start_block = next_block,
-               .block_count = page_map_blocks,
-               .kind = RL_KIND_INDEX_PAGE_MAP,
-               .instance = RL_SOLE_INSTANCE,
-       };
-
-       next_block += page_map_blocks;
-
-       free_blocks = isl->index_save.block_count - page_map_blocks - 1;
-       isl->free_space = (struct layout_region) {
-               .start_block = next_block,
-               .block_count = free_blocks,
-               .kind = RL_KIND_EMPTY,
-               .instance = RL_SOLE_INSTANCE,
-       };
-}
-
-static int __must_check invalidate_old_save(struct index_layout *layout,
-                                           struct index_save_layout *isl)
-{
-       reset_index_save_layout(isl, layout->super.page_map_blocks);
-       return write_index_save_layout(layout, isl);
-}
-
-static int discard_index_state_data(struct index_layout *layout)
-{
-       int result;
-       int saved_result = UDS_SUCCESS;
-       unsigned int i;
-
-       for (i = 0; i < layout->super.max_saves; i++) {
-               result = invalidate_old_save(layout, &layout->index.saves[i]);
-               if (result != UDS_SUCCESS)
-                       saved_result = result;
-       }
-
-       if (saved_result != UDS_SUCCESS) {
-               return uds_log_error_strerror(saved_result,
-                                             "%s: cannot destroy all index saves",
-                                             __func__);
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check make_layout_region_table(struct index_layout *layout,
-                                                struct region_table **table_ptr)
-{
-       int result;
-       unsigned int i;
-       /* Regions: header, config, index, volume, saves, seal */
-       u16 region_count = 5 + layout->super.max_saves;
-       u16 payload;
-       struct region_table *table;
-       struct layout_region *lr;
-
-       result = uds_allocate_extended(struct region_table, region_count,
-                                      struct layout_region, "layout region table",
-                                      &table);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       lr = &table->regions[0];
-       *lr++ = layout->header;
-       *lr++ = layout->config;
-       *lr++ = layout->index.sub_index;
-       *lr++ = layout->index.volume;
-
-       for (i = 0; i < layout->super.max_saves; i++)
-               *lr++ = layout->index.saves[i].index_save;
-
-       *lr++ = layout->seal;
-
-       if (is_converted_super_block(&layout->super)) {
-               payload = sizeof(struct super_block_data);
-       } else {
-               payload = (sizeof(struct super_block_data) -
-                          sizeof(layout->super.volume_offset) -
-                          sizeof(layout->super.start_offset));
-       }
-
-       table->header = (struct region_header) {
-               .magic = REGION_MAGIC,
-               .region_blocks = layout->total_blocks,
-               .type = RH_TYPE_SUPER,
-               .version = 1,
-               .region_count = region_count,
-               .payload = payload,
-       };
-
-       table->encoded_size = (sizeof(struct region_header) + payload +
-                              region_count * sizeof(struct layout_region));
-       *table_ptr = table;
-       return UDS_SUCCESS;
-}
-
-static int __must_check write_layout_header(struct index_layout *layout,
-                                           struct region_table *table,
-                                           struct buffered_writer *writer)
-{
-       int result;
-       u8 *buffer;
-       size_t offset = 0;
-
-       result = uds_allocate(table->encoded_size, u8, "layout data", &buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       encode_region_table(buffer, &offset, table);
-       memcpy(buffer + offset, &layout->super.magic_label, MAGIC_SIZE);
-       offset += MAGIC_SIZE;
-       memcpy(buffer + offset, &layout->super.nonce_info, NONCE_INFO_SIZE);
-       offset += NONCE_INFO_SIZE;
-       encode_u64_le(buffer, &offset, layout->super.nonce);
-       encode_u32_le(buffer, &offset, layout->super.version);
-       encode_u32_le(buffer, &offset, layout->super.block_size);
-       encode_u16_le(buffer, &offset, layout->super.index_count);
-       encode_u16_le(buffer, &offset, layout->super.max_saves);
-       encode_u32_le(buffer, &offset, 0);
-       encode_u64_le(buffer, &offset, layout->super.open_chapter_blocks);
-       encode_u64_le(buffer, &offset, layout->super.page_map_blocks);
-
-       if (is_converted_super_block(&layout->super)) {
-               encode_u64_le(buffer, &offset, layout->super.volume_offset);
-               encode_u64_le(buffer, &offset, layout->super.start_offset);
-       }
-
-       result = uds_write_to_buffered_writer(writer, buffer, offset);
-       uds_free(buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return uds_flush_buffered_writer(writer);
-}
-
-static int __must_check write_uds_index_config(struct index_layout *layout,
-                                              struct uds_configuration *config,
-                                              off_t offset)
-{
-       int result;
-       struct buffered_writer *writer = NULL;
-
-       result = open_layout_writer(layout, &layout->config, offset, &writer);
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "failed to open config region");
-
-       result = uds_write_config_contents(writer, config, layout->super.version);
-       if (result != UDS_SUCCESS) {
-               uds_free_buffered_writer(writer);
-               return uds_log_error_strerror(result, "failed to write config region");
-       }
-
-       result = uds_flush_buffered_writer(writer);
-       if (result != UDS_SUCCESS) {
-               uds_free_buffered_writer(writer);
-               return uds_log_error_strerror(result, "cannot flush config writer");
-       }
-
-       uds_free_buffered_writer(writer);
-       return UDS_SUCCESS;
-}
-
-static int __must_check save_layout(struct index_layout *layout, off_t offset)
-{
-       int result;
-       struct buffered_writer *writer = NULL;
-       struct region_table *table;
-
-       result = make_layout_region_table(layout, &table);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = open_layout_writer(layout, &layout->header, offset, &writer);
-       if (result != UDS_SUCCESS) {
-               uds_free(table);
-               return result;
-       }
-
-       result = write_layout_header(layout, table, writer);
-       uds_free(table);
-       uds_free_buffered_writer(writer);
-
-       return result;
-}
-
-static int create_index_layout(struct index_layout *layout, struct uds_configuration *config)
-{
-       int result;
-       struct save_layout_sizes sizes;
-
-       result = compute_sizes(config, &sizes);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(sizes.save_count, struct index_save_layout, __func__,
-                             &layout->index.saves);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       initialize_layout(layout, &sizes);
-
-       result = discard_index_state_data(layout);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = write_uds_index_config(layout, config, 0);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return save_layout(layout, 0);
-}
-
-static u64 generate_index_save_nonce(u64 volume_nonce, struct index_save_layout *isl)
-{
-       struct save_nonce_data {
-               struct index_save_data data;
-               u64 offset;
-       } nonce_data;
-       u8 buffer[sizeof(nonce_data)];
-       size_t offset = 0;
-
-       encode_u64_le(buffer, &offset, isl->save_data.timestamp);
-       encode_u64_le(buffer, &offset, 0);
-       encode_u32_le(buffer, &offset, isl->save_data.version);
-       encode_u32_le(buffer, &offset, 0U);
-       encode_u64_le(buffer, &offset, isl->index_save.start_block);
-       ASSERT_LOG_ONLY(offset == sizeof(nonce_data),
-                       "%zu bytes encoded of %zu expected", offset, sizeof(nonce_data));
-       return generate_secondary_nonce(volume_nonce, buffer, sizeof(buffer));
-}
-
-static u64 validate_index_save_layout(struct index_save_layout *isl, u64 volume_nonce)
-{
-       if ((isl->zone_count == 0) || (isl->save_data.timestamp == 0))
-               return 0;
-
-       if (isl->save_data.nonce != generate_index_save_nonce(volume_nonce, isl))
-               return 0;
-
-       return isl->save_data.timestamp;
-}
-
-static int find_latest_uds_index_save_slot(struct index_layout *layout,
-                                          struct index_save_layout **isl_ptr)
-{
-       struct index_save_layout *latest = NULL;
-       struct index_save_layout *isl;
-       unsigned int i;
-       u64 save_time = 0;
-       u64 latest_time = 0;
-
-       for (i = 0; i < layout->super.max_saves; i++) {
-               isl = &layout->index.saves[i];
-               save_time = validate_index_save_layout(isl, layout->index.nonce);
-               if (save_time > latest_time) {
-                       latest = isl;
-                       latest_time = save_time;
-               }
-       }
-
-       if (latest == NULL) {
-               uds_log_error("No valid index save found");
-               return UDS_INDEX_NOT_SAVED_CLEANLY;
-       }
-
-       *isl_ptr = latest;
-       return UDS_SUCCESS;
-}
-
-int uds_discard_open_chapter(struct index_layout *layout)
-{
-       int result;
-       struct index_save_layout *isl;
-       struct buffered_writer *writer;
-
-       result = find_latest_uds_index_save_slot(layout, &isl);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = open_region_writer(layout, &isl->open_chapter, &writer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_write_to_buffered_writer(writer, NULL, UDS_BLOCK_SIZE);
-       if (result != UDS_SUCCESS) {
-               uds_free_buffered_writer(writer);
-               return result;
-       }
-
-       result = uds_flush_buffered_writer(writer);
-       uds_free_buffered_writer(writer);
-       return result;
-}
-
-int uds_load_index_state(struct index_layout *layout, struct uds_index *index)
-{
-       int result;
-       unsigned int zone;
-       struct index_save_layout *isl;
-       struct buffered_reader *readers[MAX_ZONES];
-
-       result = find_latest_uds_index_save_slot(layout, &isl);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       index->newest_virtual_chapter = isl->state_data.newest_chapter;
-       index->oldest_virtual_chapter = isl->state_data.oldest_chapter;
-       index->last_save = isl->state_data.last_save;
-
-       result = open_region_reader(layout, &isl->open_chapter, &readers[0]);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_load_open_chapter(index, readers[0]);
-       uds_free_buffered_reader(readers[0]);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       for (zone = 0; zone < isl->zone_count; zone++) {
-               result = open_region_reader(layout, &isl->volume_index_zones[zone],
-                                           &readers[zone]);
-               if (result != UDS_SUCCESS) {
-                       for (; zone > 0; zone--)
-                               uds_free_buffered_reader(readers[zone - 1]);
-
-                       return result;
-               }
-       }
-
-       result = uds_load_volume_index(index->volume_index, readers, isl->zone_count);
-       for (zone = 0; zone < isl->zone_count; zone++)
-               uds_free_buffered_reader(readers[zone]);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = open_region_reader(layout, &isl->index_page_map, &readers[0]);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_read_index_page_map(index->volume->index_page_map, readers[0]);
-       uds_free_buffered_reader(readers[0]);
-
-       return result;
-}
-
-static struct index_save_layout *select_oldest_index_save_layout(struct index_layout *layout)
-{
-       struct index_save_layout *oldest = NULL;
-       struct index_save_layout *isl;
-       unsigned int i;
-       u64 save_time = 0;
-       u64 oldest_time = 0;
-
-       for (i = 0; i < layout->super.max_saves; i++) {
-               isl = &layout->index.saves[i];
-               save_time = validate_index_save_layout(isl, layout->index.nonce);
-               if (oldest == NULL || save_time < oldest_time) {
-                       oldest = isl;
-                       oldest_time = save_time;
-               }
-       }
-
-       return oldest;
-}
-
-static void instantiate_index_save_layout(struct index_save_layout *isl,
-                                         struct super_block_data *super,
-                                         u64 volume_nonce, unsigned int zone_count)
-{
-       unsigned int z;
-       u64 next_block;
-       u64 free_blocks;
-       u64 volume_index_blocks;
-
-       isl->zone_count = zone_count;
-       memset(&isl->save_data, 0, sizeof(isl->save_data));
-       isl->save_data.timestamp = ktime_to_ms(current_time_ns(CLOCK_REALTIME));
-       isl->save_data.version = 1;
-       isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl);
-
-       next_block = isl->index_save.start_block;
-       isl->header = (struct layout_region) {
-               .start_block = next_block++,
-               .block_count = 1,
-               .kind = RL_KIND_HEADER,
-               .instance = RL_SOLE_INSTANCE,
-       };
-
-       isl->index_page_map = (struct layout_region) {
-               .start_block = next_block,
-               .block_count = super->page_map_blocks,
-               .kind = RL_KIND_INDEX_PAGE_MAP,
-               .instance = RL_SOLE_INSTANCE,
-       };
-       next_block += super->page_map_blocks;
-
-       free_blocks = (isl->index_save.block_count - 1 -
-                      super->page_map_blocks -
-                      super->open_chapter_blocks);
-       volume_index_blocks = free_blocks / isl->zone_count;
-       for (z = 0; z < isl->zone_count; z++) {
-               isl->volume_index_zones[z] = (struct layout_region) {
-                       .start_block = next_block,
-                       .block_count = volume_index_blocks,
-                       .kind = RL_KIND_VOLUME_INDEX,
-                       .instance = z,
-               };
-
-               next_block += volume_index_blocks;
-               free_blocks -= volume_index_blocks;
-       }
-
-       isl->open_chapter = (struct layout_region) {
-               .start_block = next_block,
-               .block_count = super->open_chapter_blocks,
-               .kind = RL_KIND_OPEN_CHAPTER,
-               .instance = RL_SOLE_INSTANCE,
-       };
-
-       next_block += super->open_chapter_blocks;
-
-       isl->free_space = (struct layout_region) {
-               .start_block = next_block,
-               .block_count = free_blocks,
-               .kind = RL_KIND_EMPTY,
-               .instance = RL_SOLE_INSTANCE,
-       };
-}
-
-static int setup_uds_index_save_slot(struct index_layout *layout,
-                                    unsigned int zone_count,
-                                    struct index_save_layout **isl_ptr)
-{
-       int result;
-       struct index_save_layout *isl;
-
-       isl = select_oldest_index_save_layout(layout);
-       result = invalidate_old_save(layout, isl);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       instantiate_index_save_layout(isl, &layout->super, layout->index.nonce,
-                                     zone_count);
-
-       *isl_ptr = isl;
-       return UDS_SUCCESS;
-}
-
-static void cancel_uds_index_save(struct index_save_layout *isl)
-{
-       memset(&isl->save_data, 0, sizeof(isl->save_data));
-       memset(&isl->state_data, 0, sizeof(isl->state_data));
-       isl->zone_count = 0;
-}
-
-int uds_save_index_state(struct index_layout *layout, struct uds_index *index)
-{
-       int result;
-       unsigned int zone;
-       struct index_save_layout *isl;
-       struct buffered_writer *writers[MAX_ZONES];
-
-       result = setup_uds_index_save_slot(layout, index->zone_count, &isl);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       isl->state_data = (struct index_state_data301) {
-               .newest_chapter = index->newest_virtual_chapter,
-               .oldest_chapter = index->oldest_virtual_chapter,
-               .last_save = index->last_save,
-       };
-
-       result = open_region_writer(layout, &isl->open_chapter, &writers[0]);
-       if (result != UDS_SUCCESS) {
-               cancel_uds_index_save(isl);
-               return result;
-       }
-
-       result = uds_save_open_chapter(index, writers[0]);
-       uds_free_buffered_writer(writers[0]);
-       if (result != UDS_SUCCESS) {
-               cancel_uds_index_save(isl);
-               return result;
-       }
-
-       for (zone = 0; zone < index->zone_count; zone++) {
-               result = open_region_writer(layout, &isl->volume_index_zones[zone],
-                                           &writers[zone]);
-               if (result != UDS_SUCCESS) {
-                       for (; zone > 0; zone--)
-                               uds_free_buffered_writer(writers[zone - 1]);
-
-                       cancel_uds_index_save(isl);
-                       return result;
-               }
-       }
-
-       result = uds_save_volume_index(index->volume_index, writers, index->zone_count);
-       for (zone = 0; zone < index->zone_count; zone++)
-               uds_free_buffered_writer(writers[zone]);
-       if (result != UDS_SUCCESS) {
-               cancel_uds_index_save(isl);
-               return result;
-       }
-
-       result = open_region_writer(layout, &isl->index_page_map, &writers[0]);
-       if (result != UDS_SUCCESS) {
-               cancel_uds_index_save(isl);
-               return result;
-       }
-
-       result = uds_write_index_page_map(index->volume->index_page_map, writers[0]);
-       uds_free_buffered_writer(writers[0]);
-       if (result != UDS_SUCCESS) {
-               cancel_uds_index_save(isl);
-               return result;
-       }
-
-       return write_index_save_layout(layout, isl);
-}
-
-static int __must_check load_region_table(struct buffered_reader *reader,
-                                         struct region_table **table_ptr)
-{
-       int result;
-       unsigned int i;
-       struct region_header header;
-       struct region_table *table;
-       u8 buffer[sizeof(struct region_header)];
-       size_t offset = 0;
-
-       result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer));
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "cannot read region table header");
-
-       decode_u64_le(buffer, &offset, &header.magic);
-       decode_u64_le(buffer, &offset, &header.region_blocks);
-       decode_u16_le(buffer, &offset, &header.type);
-       decode_u16_le(buffer, &offset, &header.version);
-       decode_u16_le(buffer, &offset, &header.region_count);
-       decode_u16_le(buffer, &offset, &header.payload);
-
-       if (header.magic != REGION_MAGIC)
-               return UDS_NO_INDEX;
-
-       if (header.version != 1) {
-               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
-                                             "unknown region table version %hu",
-                                             header.version);
-       }
-
-       result = uds_allocate_extended(struct region_table, header.region_count,
-                                      struct layout_region,
-                                      "single file layout region table", &table);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       table->header = header;
-       for (i = 0; i < header.region_count; i++) {
-               u8 region_buffer[sizeof(struct layout_region)];
-
-               offset = 0;
-               result = uds_read_from_buffered_reader(reader, region_buffer,
-                                                      sizeof(region_buffer));
-               if (result != UDS_SUCCESS) {
-                       uds_free(table);
-                       return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                                     "cannot read region table layouts");
-               }
-
-               decode_u64_le(region_buffer, &offset, &table->regions[i].start_block);
-               decode_u64_le(region_buffer, &offset, &table->regions[i].block_count);
-               offset += sizeof(u32);
-               decode_u16_le(region_buffer, &offset, &table->regions[i].kind);
-               decode_u16_le(region_buffer, &offset, &table->regions[i].instance);
-       }
-
-       *table_ptr = table;
-       return UDS_SUCCESS;
-}
-
-static int __must_check read_super_block_data(struct buffered_reader *reader,
-                                             struct index_layout *layout,
-                                             size_t saved_size)
-{
-       int result;
-       struct super_block_data *super = &layout->super;
-       u8 *buffer;
-       size_t offset = 0;
-
-       result = uds_allocate(saved_size, u8, "super block data", &buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_read_from_buffered_reader(reader, buffer, saved_size);
-       if (result != UDS_SUCCESS) {
-               uds_free(buffer);
-               return uds_log_error_strerror(result, "cannot read region table header");
-       }
-
-       memcpy(&super->magic_label, buffer, MAGIC_SIZE);
-       offset += MAGIC_SIZE;
-       memcpy(&super->nonce_info, buffer + offset, NONCE_INFO_SIZE);
-       offset += NONCE_INFO_SIZE;
-       decode_u64_le(buffer, &offset, &super->nonce);
-       decode_u32_le(buffer, &offset, &super->version);
-       decode_u32_le(buffer, &offset, &super->block_size);
-       decode_u16_le(buffer, &offset, &super->index_count);
-       decode_u16_le(buffer, &offset, &super->max_saves);
-       offset += sizeof(u32);
-       decode_u64_le(buffer, &offset, &super->open_chapter_blocks);
-       decode_u64_le(buffer, &offset, &super->page_map_blocks);
-
-       if (is_converted_super_block(super)) {
-               decode_u64_le(buffer, &offset, &super->volume_offset);
-               decode_u64_le(buffer, &offset, &super->start_offset);
-       } else {
-               super->volume_offset = 0;
-               super->start_offset = 0;
-       }
-
-       uds_free(buffer);
-
-       if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0)
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "unknown superblock magic label");
-
-       if ((super->version < SUPER_VERSION_MINIMUM) ||
-           (super->version == 4) || (super->version == 5) || (super->version == 6) ||
-           (super->version > SUPER_VERSION_MAXIMUM)) {
-               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
-                                             "unknown superblock version number %u",
-                                             super->version);
-       }
-
-       if (super->volume_offset < super->start_offset) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "inconsistent offsets (start %llu, volume %llu)",
-                                             (unsigned long long) super->start_offset,
-                                             (unsigned long long) super->volume_offset);
-       }
-
-       /* Sub-indexes are no longer used but the layout retains this field. */
-       if (super->index_count != 1) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "invalid subindex count %u",
-                                             super->index_count);
-       }
-
-       if (generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info)) != super->nonce) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "inconsistent superblock nonce");
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check verify_region(struct layout_region *lr, u64 start_block,
-                                     enum region_kind kind, unsigned int instance)
-{
-       if (lr->start_block != start_block)
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "incorrect layout region offset");
-
-       if (lr->kind != kind)
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "incorrect layout region kind");
-
-       if (lr->instance != instance) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "incorrect layout region instance");
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check verify_sub_index(struct index_layout *layout, u64 start_block,
-                                        struct region_table *table)
-{
-       int result;
-       unsigned int i;
-       struct sub_index_layout *sil = &layout->index;
-       u64 next_block = start_block;
-
-       sil->sub_index = table->regions[2];
-       result = verify_region(&sil->sub_index, next_block, RL_KIND_INDEX, 0);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       define_sub_index_nonce(layout);
-
-       sil->volume = table->regions[3];
-       result = verify_region(&sil->volume, next_block, RL_KIND_VOLUME,
-                              RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       next_block += sil->volume.block_count + layout->super.volume_offset;
-
-       for (i = 0; i < layout->super.max_saves; i++) {
-               sil->saves[i].index_save = table->regions[i + 4];
-               result = verify_region(&sil->saves[i].index_save, next_block,
-                                      RL_KIND_SAVE, i);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               next_block += sil->saves[i].index_save.block_count;
-       }
-
-       next_block -= layout->super.volume_offset;
-       if (next_block != start_block + sil->sub_index.block_count) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "sub index region does not span all saves");
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check reconstitute_layout(struct index_layout *layout,
-                                           struct region_table *table, u64 first_block)
-{
-       int result;
-       u64 next_block = first_block;
-
-       result = uds_allocate(layout->super.max_saves, struct index_save_layout,
-                             __func__, &layout->index.saves);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       layout->total_blocks = table->header.region_blocks;
-
-       layout->header = table->regions[0];
-       result = verify_region(&layout->header, next_block++, RL_KIND_HEADER,
-                              RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       layout->config = table->regions[1];
-       result = verify_region(&layout->config, next_block++, RL_KIND_CONFIG,
-                              RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = verify_sub_index(layout, next_block, table);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       next_block += layout->index.sub_index.block_count;
-
-       layout->seal = table->regions[table->header.region_count - 1];
-       result = verify_region(&layout->seal, next_block + layout->super.volume_offset,
-                              RL_KIND_SEAL, RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (++next_block != (first_block + layout->total_blocks)) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "layout table does not span total blocks");
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check load_super_block(struct index_layout *layout, size_t block_size,
-                                        u64 first_block, struct buffered_reader *reader)
-{
-       int result;
-       struct region_table *table = NULL;
-       struct super_block_data *super = &layout->super;
-
-       result = load_region_table(reader, &table);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (table->header.type != RH_TYPE_SUPER) {
-               uds_free(table);
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "not a superblock region table");
-       }
-
-       result = read_super_block_data(reader, layout, table->header.payload);
-       if (result != UDS_SUCCESS) {
-               uds_free(table);
-               return uds_log_error_strerror(result, "unknown superblock format");
-       }
-
-       if (super->block_size != block_size) {
-               uds_free(table);
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "superblock saved block_size %u differs from supplied block_size %zu",
-                                             super->block_size, block_size);
-       }
-
-       first_block -= (super->volume_offset - super->start_offset);
-       result = reconstitute_layout(layout, table, first_block);
-       uds_free(table);
-       return result;
-}
-
-static int __must_check read_index_save_data(struct buffered_reader *reader,
-                                            struct index_save_layout *isl,
-                                            size_t saved_size)
-{
-       int result;
-       struct index_state_version file_version;
-       u8 buffer[sizeof(struct index_save_data) + sizeof(struct index_state_data301)];
-       size_t offset = 0;
-
-       if (saved_size != sizeof(buffer)) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "unexpected index save data size %zu",
-                                             saved_size);
-       }
-
-       result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer));
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "cannot read index save data");
-
-       decode_u64_le(buffer, &offset, &isl->save_data.timestamp);
-       decode_u64_le(buffer, &offset, &isl->save_data.nonce);
-       decode_u32_le(buffer, &offset, &isl->save_data.version);
-       offset += sizeof(u32);
-
-       if (isl->save_data.version > 1) {
-               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
-                                             "unknown index save version number %u",
-                                             isl->save_data.version);
-       }
-
-       decode_s32_le(buffer, &offset, &file_version.signature);
-       decode_s32_le(buffer, &offset, &file_version.version_id);
-
-       if ((file_version.signature != INDEX_STATE_VERSION_301.signature) ||
-           (file_version.version_id != INDEX_STATE_VERSION_301.version_id)) {
-               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
-                                             "index state version %d,%d is unsupported",
-                                             file_version.signature,
-                                             file_version.version_id);
-       }
-
-       decode_u64_le(buffer, &offset, &isl->state_data.newest_chapter);
-       decode_u64_le(buffer, &offset, &isl->state_data.oldest_chapter);
-       decode_u64_le(buffer, &offset, &isl->state_data.last_save);
-       /* Skip past some historical fields that are now unused */
-       offset += sizeof(u32) + sizeof(u32);
-       return UDS_SUCCESS;
-}
-
-static int __must_check reconstruct_index_save(struct index_save_layout *isl,
-                                              struct region_table *table)
-{
-       int result;
-       unsigned int z;
-       struct layout_region *last_region;
-       u64 next_block = isl->index_save.start_block;
-       u64 last_block = next_block + isl->index_save.block_count;
-
-       isl->zone_count = table->header.region_count - 3;
-
-       last_region = &table->regions[table->header.region_count - 1];
-       if (last_region->kind == RL_KIND_EMPTY) {
-               isl->free_space = *last_region;
-               isl->zone_count--;
-       } else {
-               isl->free_space = (struct layout_region) {
-                       .start_block = last_block,
-                       .block_count = 0,
-                       .kind = RL_KIND_EMPTY,
-                       .instance = RL_SOLE_INSTANCE,
-               };
-       }
-
-       isl->header = table->regions[0];
-       result = verify_region(&isl->header, next_block++, RL_KIND_HEADER,
-                              RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       isl->index_page_map = table->regions[1];
-       result = verify_region(&isl->index_page_map, next_block, RL_KIND_INDEX_PAGE_MAP,
-                              RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       next_block += isl->index_page_map.block_count;
-
-       for (z = 0; z < isl->zone_count; z++) {
-               isl->volume_index_zones[z] = table->regions[z + 2];
-               result = verify_region(&isl->volume_index_zones[z], next_block,
-                                      RL_KIND_VOLUME_INDEX, z);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               next_block += isl->volume_index_zones[z].block_count;
-       }
-
-       isl->open_chapter = table->regions[isl->zone_count + 2];
-       result = verify_region(&isl->open_chapter, next_block, RL_KIND_OPEN_CHAPTER,
-                              RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       next_block += isl->open_chapter.block_count;
-
-       result = verify_region(&isl->free_space, next_block, RL_KIND_EMPTY,
-                              RL_SOLE_INSTANCE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       next_block += isl->free_space.block_count;
-       if (next_block != last_block) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "index save layout table incomplete");
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check load_index_save(struct index_save_layout *isl,
-                                       struct buffered_reader *reader,
-                                       unsigned int instance)
-{
-       int result;
-       struct region_table *table = NULL;
-
-       result = load_region_table(reader, &table);
-       if (result != UDS_SUCCESS) {
-               return uds_log_error_strerror(result, "cannot read index save %u header",
-                                             instance);
-       }
-
-       if (table->header.region_blocks != isl->index_save.block_count) {
-               u64 region_blocks = table->header.region_blocks;
-
-               uds_free(table);
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "unexpected index save %u region block count %llu",
-                                             instance,
-                                             (unsigned long long) region_blocks);
-       }
-
-       if (table->header.type == RH_TYPE_UNSAVED) {
-               uds_free(table);
-               reset_index_save_layout(isl, 0);
-               return UDS_SUCCESS;
-       }
-
-       if (table->header.type != RH_TYPE_SAVE) {
-               uds_free(table);
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "unexpected index save %u header type %u",
-                                             instance, table->header.type);
-       }
-
-       result = read_index_save_data(reader, isl, table->header.payload);
-       if (result != UDS_SUCCESS) {
-               uds_free(table);
-               return uds_log_error_strerror(result,
-                                             "unknown index save %u data format",
-                                             instance);
-       }
-
-       result = reconstruct_index_save(isl, table);
-       uds_free(table);
-       if (result != UDS_SUCCESS) {
-               return uds_log_error_strerror(result, "cannot reconstruct index save %u",
-                                             instance);
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check load_sub_index_regions(struct index_layout *layout)
-{
-       int result;
-       unsigned int j;
-       struct index_save_layout *isl;
-       struct buffered_reader *reader;
-
-       for (j = 0; j < layout->super.max_saves; j++) {
-               isl = &layout->index.saves[j];
-               result = open_region_reader(layout, &isl->index_save, &reader);
-               if (result != UDS_SUCCESS) {
-                       uds_log_error_strerror(result,
-                                              "cannot get reader for index 0 save %u",
-                                              j);
-                       return result;
-               }
-
-               result = load_index_save(isl, reader, j);
-               uds_free_buffered_reader(reader);
-               if (result != UDS_SUCCESS) {
-                       /* Another save slot might be valid. */
-                       reset_index_save_layout(isl, 0);
-                       continue;
-               }
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int __must_check verify_uds_index_config(struct index_layout *layout,
-                                               struct uds_configuration *config)
-{
-       int result;
-       struct buffered_reader *reader = NULL;
-       u64 offset;
-
-       offset = layout->super.volume_offset - layout->super.start_offset;
-       result = open_layout_reader(layout, &layout->config, offset, &reader);
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "failed to open config reader");
-
-       result = uds_validate_config_contents(reader, config);
-       if (result != UDS_SUCCESS) {
-               uds_free_buffered_reader(reader);
-               return uds_log_error_strerror(result, "failed to read config region");
-       }
-
-       uds_free_buffered_reader(reader);
-       return UDS_SUCCESS;
-}
-
-static int load_index_layout(struct index_layout *layout, struct uds_configuration *config)
-{
-       int result;
-       struct buffered_reader *reader;
-
-       result = uds_make_buffered_reader(layout->factory,
-                                         layout->offset / UDS_BLOCK_SIZE, 1, &reader);
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result, "unable to read superblock");
-
-       result = load_super_block(layout, UDS_BLOCK_SIZE,
-                                 layout->offset / UDS_BLOCK_SIZE, reader);
-       uds_free_buffered_reader(reader);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = verify_uds_index_config(layout, config);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return load_sub_index_regions(layout);
-}
-
-static int create_layout_factory(struct index_layout *layout,
-                                const struct uds_configuration *config)
-{
-       int result;
-       size_t writable_size;
-       struct io_factory *factory = NULL;
-
-       result = uds_make_io_factory(config->bdev, &factory);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       writable_size = uds_get_writable_size(factory) & -UDS_BLOCK_SIZE;
-       if (writable_size < config->size + config->offset) {
-               uds_put_io_factory(factory);
-               uds_log_error("index storage (%zu) is smaller than the requested size %zu",
-                             writable_size, config->size + config->offset);
-               return -ENOSPC;
-       }
-
-       layout->factory = factory;
-       layout->factory_size = (config->size > 0) ? config->size : writable_size;
-       layout->offset = config->offset;
-       return UDS_SUCCESS;
-}
-
-int uds_make_index_layout(struct uds_configuration *config, bool new_layout,
-                         struct index_layout **layout_ptr)
-{
-       int result;
-       struct index_layout *layout = NULL;
-       struct save_layout_sizes sizes;
-
-       result = compute_sizes(config, &sizes);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(1, struct index_layout, __func__, &layout);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = create_layout_factory(layout, config);
-       if (result != UDS_SUCCESS) {
-               uds_free_index_layout(layout);
-               return result;
-       }
-
-       if (layout->factory_size < sizes.total_size) {
-               uds_log_error("index storage (%zu) is smaller than the required size %llu",
-                             layout->factory_size,
-                             (unsigned long long) sizes.total_size);
-               uds_free_index_layout(layout);
-               return -ENOSPC;
-       }
-
-       if (new_layout)
-               result = create_index_layout(layout, config);
-       else
-               result = load_index_layout(layout, config);
-       if (result != UDS_SUCCESS) {
-               uds_free_index_layout(layout);
-               return result;
-       }
-
-       *layout_ptr = layout;
-       return UDS_SUCCESS;
-}
-
-void uds_free_index_layout(struct index_layout *layout)
-{
-       if (layout == NULL)
-               return;
-
-       uds_free(layout->index.saves);
-       if (layout->factory != NULL)
-               uds_put_io_factory(layout->factory);
-
-       uds_free(layout);
-}
-
-int uds_replace_index_layout_storage(struct index_layout *layout,
-                                    struct block_device *bdev)
-{
-       return uds_replace_storage(layout->factory, bdev);
-}
-
-/* Obtain a dm_bufio_client for the volume region. */
-int uds_open_volume_bufio(struct index_layout *layout, size_t block_size,
-                         unsigned int reserved_buffers,
-                         struct dm_bufio_client **client_ptr)
-{
-       off_t offset = (layout->index.volume.start_block +
-                       layout->super.volume_offset -
-                       layout->super.start_offset);
-
-       return uds_make_bufio(layout->factory, offset, block_size, reserved_buffers,
-                             client_ptr);
-}
-
-u64 uds_get_volume_nonce(struct index_layout *layout)
-{
-       return layout->index.nonce;
-}
diff --git a/drivers/md/dm-vdo/index-layout.h b/drivers/md/dm-vdo/index-layout.h
deleted file mode 100644 (file)
index e9ac6f4..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_LAYOUT_H
-#define UDS_INDEX_LAYOUT_H
-
-#include "config.h"
-#include "indexer.h"
-#include "io-factory.h"
-
-/*
- * The index layout describes the format of the index on the underlying storage, and is responsible
- * for creating those structures when the index is first created. It also validates the index data
- * when loading a saved index, and updates it when saving the index.
- */
-
-struct index_layout;
-
-int __must_check uds_make_index_layout(struct uds_configuration *config, bool new_layout,
-                                      struct index_layout **layout_ptr);
-
-void uds_free_index_layout(struct index_layout *layout);
-
-int __must_check uds_replace_index_layout_storage(struct index_layout *layout,
-                                                 struct block_device *bdev);
-
-int __must_check uds_load_index_state(struct index_layout *layout,
-                                     struct uds_index *index);
-
-int __must_check uds_save_index_state(struct index_layout *layout,
-                                     struct uds_index *index);
-
-int __must_check uds_discard_open_chapter(struct index_layout *layout);
-
-u64 __must_check uds_get_volume_nonce(struct index_layout *layout);
-
-int __must_check uds_open_volume_bufio(struct index_layout *layout, size_t block_size,
-                                      unsigned int reserved_buffers,
-                                      struct dm_bufio_client **client_ptr);
-
-#endif /* UDS_INDEX_LAYOUT_H */
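For orientation, here is a minimal usage sketch of the layout API declared above, assuming a kernel context and a uds_configuration whose bdev, size, and offset fields are already populated; the function name, the 4096-byte block size, and the single reserved buffer are placeholders, and error handling is abbreviated:

/* Sketch only: create (or load) a layout, fetch the volume nonce, open a
 * dm_bufio client for the volume region, then tear everything down. */
static int layout_usage_sketch(struct uds_configuration *config, bool new_layout)
{
	struct index_layout *layout;
	struct dm_bufio_client *client;
	int result;

	result = uds_make_index_layout(config, new_layout, &layout);
	if (result != UDS_SUCCESS)
		return result;

	uds_log_info("volume nonce %llu",
		     (unsigned long long) uds_get_volume_nonce(layout));

	/* 4096 and 1 stand in for the real block size and buffer reservation. */
	result = uds_open_volume_bufio(layout, 4096, 1, &client);
	if (result == UDS_SUCCESS)
		dm_bufio_client_destroy(client);

	uds_free_index_layout(layout);
	return result;
}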
diff --git a/drivers/md/dm-vdo/index-page-map.c b/drivers/md/dm-vdo/index-page-map.c
deleted file mode 100644 (file)
index 1bb1206..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "index-page-map.h"
-
-#include "errors.h"
-#include "hash-utils.h"
-#include "indexer.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "numeric.h"
-#include "permassert.h"
-#include "string-utils.h"
-#include "thread-utils.h"
-
-/*
- * The index page map is conceptually a two-dimensional array indexed by chapter number and index
- * page number within the chapter. Each entry contains the number of the last delta list on that
- * index page. In order to save memory, the information for the last page in each chapter is not
- * recorded, as it is known from the geometry.
- */
-
-static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02";
-
-enum {
-       PAGE_MAP_MAGIC_LENGTH = sizeof(PAGE_MAP_MAGIC) - 1,
-};
-
-static inline u32 get_entry_count(const struct index_geometry *geometry)
-{
-       return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1);
-}
-
-int uds_make_index_page_map(const struct index_geometry *geometry,
-                           struct index_page_map **map_ptr)
-{
-       int result;
-       struct index_page_map *map;
-
-       result = uds_allocate(1, struct index_page_map, "page map", &map);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       map->geometry = geometry;
-       map->entries_per_chapter = geometry->index_pages_per_chapter - 1;
-       result = uds_allocate(get_entry_count(geometry), u16, "Index Page Map Entries",
-                             &map->entries);
-       if (result != UDS_SUCCESS) {
-               uds_free_index_page_map(map);
-               return result;
-       }
-
-       *map_ptr = map;
-       return UDS_SUCCESS;
-}
-
-void uds_free_index_page_map(struct index_page_map *map)
-{
-       if (map != NULL) {
-               uds_free(map->entries);
-               uds_free(map);
-       }
-}
-
-void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
-                              u32 chapter_number, u32 index_page_number,
-                              u32 delta_list_number)
-{
-       size_t slot;
-
-       map->last_update = virtual_chapter_number;
-       if (index_page_number == map->entries_per_chapter)
-               return;
-
-       slot = (chapter_number * map->entries_per_chapter) + index_page_number;
-       map->entries[slot] = delta_list_number;
-}
-
-u32 uds_find_index_page_number(const struct index_page_map *map,
-                              const struct uds_record_name *name, u32 chapter_number)
-{
-       u32 delta_list_number = uds_hash_to_chapter_delta_list(name, map->geometry);
-       u32 slot = chapter_number * map->entries_per_chapter;
-       u32 page;
-
-       for (page = 0; page < map->entries_per_chapter; page++) {
-               if (delta_list_number <= map->entries[slot + page])
-                       break;
-       }
-
-       return page;
-}
-
-void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
-                               u32 index_page_number, u32 *lowest_list,
-                               u32 *highest_list)
-{
-       u32 slot = chapter_number * map->entries_per_chapter;
-
-       *lowest_list = ((index_page_number == 0) ?
-                       0 : map->entries[slot + index_page_number - 1] + 1);
-       *highest_list = ((index_page_number < map->entries_per_chapter) ?
-                        map->entries[slot + index_page_number] :
-                        map->geometry->delta_lists_per_chapter - 1);
-}
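A hypothetical worked example of the two lookups above: suppose entries_per_chapter is 3 (four index pages per chapter) and a chapter's slots hold 9, 27, and 41, the last delta lists on pages 0, 1, and 2. A record name hashing to delta list 30 is placed on page 2 by uds_find_index_page_number (30 > 9, 30 > 27, 30 <= 41), and uds_get_list_number_bounds reports lists 28 through 41 for that page; a name hashing to list 50 falls through to the final page, whose bounds are 42 through delta_lists_per_chapter - 1.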
-
-u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry)
-{
-       return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry);
-}
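Plugging hypothetical numbers into the formula above: with 1024 chapters per volume and 6 index pages per chapter, get_entry_count() is 1024 * 5 = 5120 entries, so a saved page map occupies 8 bytes of magic, 8 bytes of last_update, and 2 * 5120 = 10240 bytes of entries, 10256 bytes in total.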
-
-int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer)
-{
-       int result;
-       u8 *buffer;
-       size_t offset = 0;
-       u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
-       u32 i;
-
-       result = uds_allocate(saved_size, u8, "page map data", &buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH);
-       offset += PAGE_MAP_MAGIC_LENGTH;
-       encode_u64_le(buffer, &offset, map->last_update);
-       for (i = 0; i < get_entry_count(map->geometry); i++)
-               encode_u16_le(buffer, &offset, map->entries[i]);
-
-       result = uds_write_to_buffered_writer(writer, buffer, offset);
-       uds_free(buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return uds_flush_buffered_writer(writer);
-}
-
-int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader)
-{
-       int result;
-       u8 magic[PAGE_MAP_MAGIC_LENGTH];
-       u8 *buffer;
-       size_t offset = 0;
-       u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
-       u32 i;
-
-       result = uds_allocate(saved_size, u8, "page map data", &buffer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_read_from_buffered_reader(reader, buffer, saved_size);
-       if (result != UDS_SUCCESS) {
-               uds_free(buffer);
-               return result;
-       }
-
-       memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH);
-       offset += PAGE_MAP_MAGIC_LENGTH;
-       if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) {
-               uds_free(buffer);
-               return UDS_CORRUPT_DATA;
-       }
-
-       decode_u64_le(buffer, &offset, &map->last_update);
-       for (i = 0; i < get_entry_count(map->geometry); i++)
-               decode_u16_le(buffer, &offset, &map->entries[i]);
-
-       uds_free(buffer);
-       uds_log_debug("read index page map, last update %llu",
-                     (unsigned long long) map->last_update);
-       return UDS_SUCCESS;
-}
diff --git a/drivers/md/dm-vdo/index-page-map.h b/drivers/md/dm-vdo/index-page-map.h
deleted file mode 100644 (file)
index b327c0b..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_PAGE_MAP_H
-#define UDS_INDEX_PAGE_MAP_H
-
-#include "geometry.h"
-#include "io-factory.h"
-
-/*
- * The index maintains a page map which records how the chapter delta lists are distributed among
- * the index pages for each chapter, allowing the volume to be efficient about reading only pages
- * that it knows it will need.
- */
-
-struct index_page_map {
-       const struct index_geometry *geometry;
-       u64 last_update;
-       u32 entries_per_chapter;
-       u16 *entries;
-};
-
-int __must_check uds_make_index_page_map(const struct index_geometry *geometry,
-                                        struct index_page_map **map_ptr);
-
-void uds_free_index_page_map(struct index_page_map *map);
-
-int __must_check uds_read_index_page_map(struct index_page_map *map,
-                                        struct buffered_reader *reader);
-
-int __must_check uds_write_index_page_map(struct index_page_map *map,
-                                         struct buffered_writer *writer);
-
-void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
-                              u32 chapter_number, u32 index_page_number,
-                              u32 delta_list_number);
-
-u32 __must_check uds_find_index_page_number(const struct index_page_map *map,
-                                           const struct uds_record_name *name,
-                                           u32 chapter_number);
-
-void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
-                               u32 index_page_number, u32 *lowest_list,
-                               u32 *highest_list);
-
-u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry);
-
-#endif /* UDS_INDEX_PAGE_MAP_H */
diff --git a/drivers/md/dm-vdo/index-session.c b/drivers/md/dm-vdo/index-session.c
deleted file mode 100644 (file)
index a482ccd..0000000
+++ /dev/null
@@ -1,738 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "index-session.h"
-
-#include <linux/atomic.h>
-
-#include "funnel-requestqueue.h"
-#include "index.h"
-#include "index-layout.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "time-utils.h"
-
-/*
- * The index session contains a lock (the request_mutex) which ensures that only one thread can
- * change the state of its index at a time. The state field indicates the current state of the
- * index through a set of descriptive flags. The request_cond must be broadcast (while holding the
- * request_mutex) whenever a non-transient state flag is cleared. The request_mutex also protects the
- * count of requests currently in progress so that they can be drained when suspending or closing the index.
- *
- * If the index session is suspended shortly after opening an index, it may have to suspend during
- * a rebuild. Depending on the size of the index, a rebuild may take a significant amount of time,
- * so UDS allows the rebuild to be paused in order to suspend the session in a timely manner. When
- * the index session is resumed, the rebuild can continue from where it left off. If the index
- * session is shut down with a suspended rebuild, the rebuild progress is abandoned and the rebuild
- * will start from the beginning the next time the index is loaded. The mutex and status fields in
- * the index_load_context are used to record the state of any interrupted rebuild.
- */
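The reference counting described above reduces to a small drain pattern: bump a counter under the mutex when a request starts, broadcast the condition variable when the counter returns to zero, and have the suspend and close paths wait for that broadcast. A simplified userspace model follows (pthreads, invented names, initialization omitted); it is a sketch of the pattern, not the kernel code below:

#include <pthread.h>

struct drain_model {
	pthread_mutex_t mutex;
	pthread_cond_t cond;
	int request_count;
};

/* A request takes a reference before being enqueued... */
static void model_get(struct drain_model *m)
{
	pthread_mutex_lock(&m->mutex);
	m->request_count++;
	pthread_mutex_unlock(&m->mutex);
}

/* ...and drops it from its completion callback; the last one wakes any waiter. */
static void model_release(struct drain_model *m)
{
	pthread_mutex_lock(&m->mutex);
	if (--m->request_count == 0)
		pthread_cond_broadcast(&m->cond);
	pthread_mutex_unlock(&m->mutex);
}

/* Suspend and close paths block here until every in-flight request is done. */
static void model_drain(struct drain_model *m)
{
	pthread_mutex_lock(&m->mutex);
	while (m->request_count > 0)
		pthread_cond_wait(&m->cond, &m->mutex);
	pthread_mutex_unlock(&m->mutex);
}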
-
-enum index_session_flag_bit {
-       IS_FLAG_BIT_START = 8,
-       /* The session has started loading an index but not completed it. */
-       IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START,
-       /* The session has loaded an index, which can handle requests. */
-       IS_FLAG_BIT_LOADED,
-       /* The session's index has been permanently disabled. */
-       IS_FLAG_BIT_DISABLED,
-       /* The session's index is suspended. */
-       IS_FLAG_BIT_SUSPENDED,
-       /* The session is handling some index state change. */
-       IS_FLAG_BIT_WAITING,
-       /* The session's index is closing and draining requests. */
-       IS_FLAG_BIT_CLOSING,
-       /* The session is being destroyed and is draining requests. */
-       IS_FLAG_BIT_DESTROYING,
-};
-
-enum index_session_flag {
-       IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED),
-       IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING),
-       IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED),
-       IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED),
-       IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING),
-       IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING),
-       IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING),
-};
-
-/* Release a reference to an index session. */
-static void release_index_session(struct uds_index_session *index_session)
-{
-       mutex_lock(&index_session->request_mutex);
-       if (--index_session->request_count == 0)
-               uds_broadcast_cond(&index_session->request_cond);
-       mutex_unlock(&index_session->request_mutex);
-}
-
-/*
- * Acquire a reference to the index session for an asynchronous index request. The reference must
- * eventually be released with a corresponding call to release_index_session().
- */
-static int get_index_session(struct uds_index_session *index_session)
-{
-       unsigned int state;
-       int result = UDS_SUCCESS;
-
-       mutex_lock(&index_session->request_mutex);
-       index_session->request_count++;
-       state = index_session->state;
-       mutex_unlock(&index_session->request_mutex);
-
-       if (state == IS_FLAG_LOADED) {
-               return UDS_SUCCESS;
-       } else if (state & IS_FLAG_DISABLED) {
-               result = UDS_DISABLED;
-       } else if ((state & IS_FLAG_LOADING) ||
-                  (state & IS_FLAG_SUSPENDED) ||
-                  (state & IS_FLAG_WAITING)) {
-               result = -EBUSY;
-       } else {
-               result = UDS_NO_INDEX;
-       }
-
-       release_index_session(index_session);
-       return result;
-}
-
-int uds_launch_request(struct uds_request *request)
-{
-       size_t internal_size;
-       int result;
-
-       if (request->callback == NULL) {
-               uds_log_error("missing required callback");
-               return -EINVAL;
-       }
-
-       switch (request->type) {
-       case UDS_DELETE:
-       case UDS_POST:
-       case UDS_QUERY:
-       case UDS_QUERY_NO_UPDATE:
-       case UDS_UPDATE:
-               break;
-       default:
-               uds_log_error("received invalid callback type");
-               return -EINVAL;
-       }
-
-       /* Reset all internal fields before processing. */
-       internal_size =
-               sizeof(struct uds_request) - offsetof(struct uds_request, zone_number);
-       // FIXME should be using struct_group for this instead
-       memset((char *) request + sizeof(*request) - internal_size, 0, internal_size);
-
-       result = get_index_session(request->session);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       request->found = false;
-       request->unbatched = false;
-       request->index = request->session->index;
-
-       uds_enqueue_request(request, STAGE_TRIAGE);
-       return UDS_SUCCESS;
-}
-
-static void enter_callback_stage(struct uds_request *request)
-{
-       if (request->status != UDS_SUCCESS) {
-               /* All request errors are considered unrecoverable */
-               mutex_lock(&request->session->request_mutex);
-               request->session->state |= IS_FLAG_DISABLED;
-               mutex_unlock(&request->session->request_mutex);
-       }
-
-       uds_request_queue_enqueue(request->session->callback_queue, request);
-}
-
-static inline void count_once(u64 *count_ptr)
-{
-       WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1);
-}
-
-static void update_session_stats(struct uds_request *request)
-{
-       struct session_stats *session_stats = &request->session->stats;
-
-       count_once(&session_stats->requests);
-
-       switch (request->type) {
-       case UDS_POST:
-               if (request->found)
-                       count_once(&session_stats->posts_found);
-               else
-                       count_once(&session_stats->posts_not_found);
-
-               if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
-                       count_once(&session_stats->posts_found_open_chapter);
-               else if (request->location == UDS_LOCATION_IN_DENSE)
-                       count_once(&session_stats->posts_found_dense);
-               else if (request->location == UDS_LOCATION_IN_SPARSE)
-                       count_once(&session_stats->posts_found_sparse);
-               break;
-
-       case UDS_UPDATE:
-               if (request->found)
-                       count_once(&session_stats->updates_found);
-               else
-                       count_once(&session_stats->updates_not_found);
-               break;
-
-       case UDS_DELETE:
-               if (request->found)
-                       count_once(&session_stats->deletions_found);
-               else
-                       count_once(&session_stats->deletions_not_found);
-               break;
-
-       case UDS_QUERY:
-       case UDS_QUERY_NO_UPDATE:
-               if (request->found)
-                       count_once(&session_stats->queries_found);
-               else
-                       count_once(&session_stats->queries_not_found);
-               break;
-
-       default:
-               request->status = ASSERT(false, "unknown request type: %d",
-                                        request->type);
-       }
-}
-
-static void handle_callbacks(struct uds_request *request)
-{
-       struct uds_index_session *index_session = request->session;
-
-       if (request->status == UDS_SUCCESS)
-               update_session_stats(request);
-
-       request->status = uds_status_to_errno(request->status);
-       request->callback(request);
-       release_index_session(index_session);
-}
-
-static int __must_check make_empty_index_session(struct uds_index_session **index_session_ptr)
-{
-       int result;
-       struct uds_index_session *session;
-
-       result = uds_allocate(1, struct uds_index_session, __func__, &session);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       mutex_init(&session->request_mutex);
-       uds_init_cond(&session->request_cond);
-       mutex_init(&session->load_context.mutex);
-       uds_init_cond(&session->load_context.cond);
-
-       result = uds_make_request_queue("callbackW", &handle_callbacks,
-                                       &session->callback_queue);
-       if (result != UDS_SUCCESS) {
-               uds_free(session);
-               return result;
-       }
-
-       *index_session_ptr = session;
-       return UDS_SUCCESS;
-}
-
-int uds_create_index_session(struct uds_index_session **session)
-{
-       if (session == NULL) {
-               uds_log_error("missing session pointer");
-               return -EINVAL;
-       }
-
-       return uds_status_to_errno(make_empty_index_session(session));
-}
-
-static int __must_check start_loading_index_session(struct uds_index_session *index_session)
-{
-       int result;
-
-       mutex_lock(&index_session->request_mutex);
-       if (index_session->state & IS_FLAG_SUSPENDED) {
-               uds_log_info("Index session is suspended");
-               result = -EBUSY;
-       } else if (index_session->state != 0) {
-               uds_log_info("Index is already loaded");
-               result = -EBUSY;
-       } else {
-               index_session->state |= IS_FLAG_LOADING;
-               result = UDS_SUCCESS;
-       }
-       mutex_unlock(&index_session->request_mutex);
-       return result;
-}
-
-static void finish_loading_index_session(struct uds_index_session *index_session,
-                                        int result)
-{
-       mutex_lock(&index_session->request_mutex);
-       index_session->state &= ~IS_FLAG_LOADING;
-       if (result == UDS_SUCCESS)
-               index_session->state |= IS_FLAG_LOADED;
-
-       uds_broadcast_cond(&index_session->request_cond);
-       mutex_unlock(&index_session->request_mutex);
-}
-
-static int initialize_index_session(struct uds_index_session *index_session,
-                                   enum uds_open_index_type open_type)
-{
-       int result;
-       struct uds_configuration *config;
-
-       result = uds_make_configuration(&index_session->parameters, &config);
-       if (result != UDS_SUCCESS) {
-               uds_log_error_strerror(result, "Failed to allocate config");
-               return result;
-       }
-
-       memset(&index_session->stats, 0, sizeof(index_session->stats));
-       result = uds_make_index(config, open_type, &index_session->load_context,
-                               enter_callback_stage, &index_session->index);
-       if (result != UDS_SUCCESS)
-               uds_log_error_strerror(result, "Failed to make index");
-       else
-               uds_log_configuration(config);
-
-       uds_free_configuration(config);
-       return result;
-}
-
-static const char *get_open_type_string(enum uds_open_index_type open_type)
-{
-       switch (open_type) {
-       case UDS_CREATE:
-               return "creating index";
-       case UDS_LOAD:
-               return "loading or rebuilding index";
-       case UDS_NO_REBUILD:
-               return "loading index";
-       default:
-               return "unknown open method";
-       }
-}
-
-/*
- * Open an index under the given session. This operation will fail if the
- * index session is suspended, or if there is already an open index.
- */
-int uds_open_index(enum uds_open_index_type open_type,
-                  const struct uds_parameters *parameters,
-                  struct uds_index_session *session)
-{
-       int result;
-       char name[BDEVNAME_SIZE];
-
-       if (parameters == NULL) {
-               uds_log_error("missing required parameters");
-               return -EINVAL;
-       }
-       if (parameters->bdev == NULL) {
-               uds_log_error("missing required block device");
-               return -EINVAL;
-       }
-       if (session == NULL) {
-               uds_log_error("missing required session pointer");
-               return -EINVAL;
-       }
-
-       result = start_loading_index_session(session);
-       if (result != UDS_SUCCESS)
-               return uds_status_to_errno(result);
-
-       session->parameters = *parameters;
-       format_dev_t(name, parameters->bdev->bd_dev);
-       uds_log_info("%s: %s", get_open_type_string(open_type), name);
-
-       result = initialize_index_session(session, open_type);
-       if (result != UDS_SUCCESS)
-               uds_log_error_strerror(result, "Failed %s",
-                                      get_open_type_string(open_type));
-
-       finish_loading_index_session(session, result);
-       return uds_status_to_errno(result);
-}
-
-static void wait_for_no_requests_in_progress(struct uds_index_session *index_session)
-{
-       mutex_lock(&index_session->request_mutex);
-       while (index_session->request_count > 0) {
-               uds_wait_cond(&index_session->request_cond,
-                             &index_session->request_mutex);
-       }
-       mutex_unlock(&index_session->request_mutex);
-}
-
-static int __must_check save_index(struct uds_index_session *index_session)
-{
-       wait_for_no_requests_in_progress(index_session);
-       return uds_save_index(index_session->index);
-}
-
-static void suspend_rebuild(struct uds_index_session *session)
-{
-       mutex_lock(&session->load_context.mutex);
-       switch (session->load_context.status) {
-       case INDEX_OPENING:
-               session->load_context.status = INDEX_SUSPENDING;
-
-               /* Wait until the index indicates that it is not replaying. */
-               while ((session->load_context.status != INDEX_SUSPENDED) &&
-                      (session->load_context.status != INDEX_READY)) {
-                       uds_wait_cond(&session->load_context.cond,
-                                     &session->load_context.mutex);
-               }
-
-               break;
-
-       case INDEX_READY:
-               /* Index load does not need to be suspended. */
-               break;
-
-       case INDEX_SUSPENDED:
-       case INDEX_SUSPENDING:
-       case INDEX_FREEING:
-       default:
-               /* These cases should not happen. */
-               ASSERT_LOG_ONLY(false, "Bad load context state %u",
-                               session->load_context.status);
-               break;
-       }
-       mutex_unlock(&session->load_context.mutex);
-}
-
-/*
- * Suspend index operation, draining all current index requests and preventing new index requests
- * from starting. Optionally saves all index data before returning.
- */
-int uds_suspend_index_session(struct uds_index_session *session, bool save)
-{
-       int result = UDS_SUCCESS;
-       bool no_work = false;
-       bool rebuilding = false;
-
-       /* Wait for any current index state change to complete. */
-       mutex_lock(&session->request_mutex);
-       while (session->state & IS_FLAG_CLOSING)
-               uds_wait_cond(&session->request_cond, &session->request_mutex);
-
-       if ((session->state & IS_FLAG_WAITING) || (session->state & IS_FLAG_DESTROYING)) {
-               no_work = true;
-               uds_log_info("Index session is already changing state");
-               result = -EBUSY;
-       } else if (session->state & IS_FLAG_SUSPENDED) {
-               no_work = true;
-       } else if (session->state & IS_FLAG_LOADING) {
-               session->state |= IS_FLAG_WAITING;
-               rebuilding = true;
-       } else if (session->state & IS_FLAG_LOADED) {
-               session->state |= IS_FLAG_WAITING;
-       } else {
-               no_work = true;
-               session->state |= IS_FLAG_SUSPENDED;
-               uds_broadcast_cond(&session->request_cond);
-       }
-       mutex_unlock(&session->request_mutex);
-
-       if (no_work)
-               return uds_status_to_errno(result);
-
-       if (rebuilding)
-               suspend_rebuild(session);
-       else if (save)
-               result = save_index(session);
-       else
-               result = uds_flush_index_session(session);
-
-       mutex_lock(&session->request_mutex);
-       session->state &= ~IS_FLAG_WAITING;
-       session->state |= IS_FLAG_SUSPENDED;
-       uds_broadcast_cond(&session->request_cond);
-       mutex_unlock(&session->request_mutex);
-       return uds_status_to_errno(result);
-}
-
-static int replace_device(struct uds_index_session *session, struct block_device *bdev)
-{
-       int result;
-
-       result = uds_replace_index_storage(session->index, bdev);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       session->parameters.bdev = bdev;
-       return UDS_SUCCESS;
-}
-
-/*
- * Resume index operation after being suspended. If the index is suspended and the supplied block
- * device differs from the current backing store, the index will start using the new backing store.
- */
-int uds_resume_index_session(struct uds_index_session *session,
-                            struct block_device *bdev)
-{
-       int result = UDS_SUCCESS;
-       bool no_work = false;
-       bool resume_replay = false;
-
-       mutex_lock(&session->request_mutex);
-       if (session->state & IS_FLAG_WAITING) {
-               uds_log_info("Index session is already changing state");
-               no_work = true;
-               result = -EBUSY;
-       } else if (!(session->state & IS_FLAG_SUSPENDED)) {
-               /* If not suspended, just succeed. */
-               no_work = true;
-               result = UDS_SUCCESS;
-       } else {
-               session->state |= IS_FLAG_WAITING;
-               if (session->state & IS_FLAG_LOADING)
-                       resume_replay = true;
-       }
-       mutex_unlock(&session->request_mutex);
-
-       if (no_work)
-               return result;
-
-       if ((session->index != NULL) && (bdev != session->parameters.bdev)) {
-               result = replace_device(session, bdev);
-               if (result != UDS_SUCCESS) {
-                       mutex_lock(&session->request_mutex);
-                       session->state &= ~IS_FLAG_WAITING;
-                       uds_broadcast_cond(&session->request_cond);
-                       mutex_unlock(&session->request_mutex);
-                       return uds_status_to_errno(result);
-               }
-       }
-
-       if (resume_replay) {
-               mutex_lock(&session->load_context.mutex);
-               switch (session->load_context.status) {
-               case INDEX_SUSPENDED:
-                       session->load_context.status = INDEX_OPENING;
-                       /* Notify the index to start replaying again. */
-                       uds_broadcast_cond(&session->load_context.cond);
-                       break;
-
-               case INDEX_READY:
-                       /* There is no index rebuild to resume. */
-                       break;
-
-               case INDEX_OPENING:
-               case INDEX_SUSPENDING:
-               case INDEX_FREEING:
-               default:
-                       /* These cases should not happen; do nothing. */
-                       ASSERT_LOG_ONLY(false, "Bad load context state %u",
-                                       session->load_context.status);
-                       break;
-               }
-               mutex_unlock(&session->load_context.mutex);
-       }
-
-       mutex_lock(&session->request_mutex);
-       session->state &= ~IS_FLAG_WAITING;
-       session->state &= ~IS_FLAG_SUSPENDED;
-       uds_broadcast_cond(&session->request_cond);
-       mutex_unlock(&session->request_mutex);
-       return UDS_SUCCESS;
-}
-
-static int save_and_free_index(struct uds_index_session *index_session)
-{
-       int result = UDS_SUCCESS;
-       bool suspended;
-       struct uds_index *index = index_session->index;
-
-       if (index == NULL)
-               return UDS_SUCCESS;
-
-       mutex_lock(&index_session->request_mutex);
-       suspended = (index_session->state & IS_FLAG_SUSPENDED);
-       mutex_unlock(&index_session->request_mutex);
-
-       if (!suspended) {
-               result = uds_save_index(index);
-               if (result != UDS_SUCCESS)
-                       uds_log_warning_strerror(result,
-                                                "ignoring error from save_index");
-       }
-       uds_free_index(index);
-       index_session->index = NULL;
-
-       /*
-        * Reset all index state that happens to be in the index
-        * session, so it doesn't affect any future index.
-        */
-       mutex_lock(&index_session->load_context.mutex);
-       index_session->load_context.status = INDEX_OPENING;
-       mutex_unlock(&index_session->load_context.mutex);
-
-       mutex_lock(&index_session->request_mutex);
-       /* Only the suspend bit will remain relevant. */
-       index_session->state &= IS_FLAG_SUSPENDED;
-       mutex_unlock(&index_session->request_mutex);
-
-       return result;
-}
-
-/* Save and close the current index. */
-int uds_close_index(struct uds_index_session *index_session)
-{
-       int result = UDS_SUCCESS;
-
-       /* Wait for any current index state change to complete. */
-       mutex_lock(&index_session->request_mutex);
-       while ((index_session->state & IS_FLAG_WAITING) ||
-              (index_session->state & IS_FLAG_CLOSING)) {
-               uds_wait_cond(&index_session->request_cond,
-                             &index_session->request_mutex);
-       }
-
-       if (index_session->state & IS_FLAG_SUSPENDED) {
-               uds_log_info("Index session is suspended");
-               result = -EBUSY;
-       } else if ((index_session->state & IS_FLAG_DESTROYING) ||
-                  !(index_session->state & IS_FLAG_LOADED)) {
-               /* The index doesn't exist, hasn't finished loading, or is being destroyed. */
-               result = UDS_NO_INDEX;
-       } else {
-               index_session->state |= IS_FLAG_CLOSING;
-       }
-       mutex_unlock(&index_session->request_mutex);
-       if (result != UDS_SUCCESS)
-               return uds_status_to_errno(result);
-
-       uds_log_debug("Closing index");
-       wait_for_no_requests_in_progress(index_session);
-       result = save_and_free_index(index_session);
-       uds_log_debug("Closed index");
-
-       mutex_lock(&index_session->request_mutex);
-       index_session->state &= ~IS_FLAG_CLOSING;
-       uds_broadcast_cond(&index_session->request_cond);
-       mutex_unlock(&index_session->request_mutex);
-       return uds_status_to_errno(result);
-}
-
-/* This will save and close an open index before destroying the session. */
-int uds_destroy_index_session(struct uds_index_session *index_session)
-{
-       int result;
-       bool load_pending = false;
-
-       uds_log_debug("Destroying index session");
-
-       /* Wait for any current index state change to complete. */
-       mutex_lock(&index_session->request_mutex);
-       while ((index_session->state & IS_FLAG_WAITING) ||
-              (index_session->state & IS_FLAG_CLOSING)) {
-               uds_wait_cond(&index_session->request_cond,
-                             &index_session->request_mutex);
-       }
-
-       if (index_session->state & IS_FLAG_DESTROYING) {
-               mutex_unlock(&index_session->request_mutex);
-               uds_log_info("Index session is already closing");
-               return -EBUSY;
-       }
-
-       index_session->state |= IS_FLAG_DESTROYING;
-       load_pending = ((index_session->state & IS_FLAG_LOADING) &&
-                       (index_session->state & IS_FLAG_SUSPENDED));
-       mutex_unlock(&index_session->request_mutex);
-
-       if (load_pending) {
-               /* Tell the index to terminate the rebuild. */
-               mutex_lock(&index_session->load_context.mutex);
-               if (index_session->load_context.status == INDEX_SUSPENDED) {
-                       index_session->load_context.status = INDEX_FREEING;
-                       uds_broadcast_cond(&index_session->load_context.cond);
-               }
-               mutex_unlock(&index_session->load_context.mutex);
-
-               /* Wait until the load exits before proceeding. */
-               mutex_lock(&index_session->request_mutex);
-               while (index_session->state & IS_FLAG_LOADING) {
-                       uds_wait_cond(&index_session->request_cond,
-                                     &index_session->request_mutex);
-               }
-               mutex_unlock(&index_session->request_mutex);
-       }
-
-       wait_for_no_requests_in_progress(index_session);
-       result = save_and_free_index(index_session);
-       uds_request_queue_finish(index_session->callback_queue);
-       index_session->callback_queue = NULL;
-       uds_log_debug("Destroyed index session");
-       uds_free(index_session);
-       return uds_status_to_errno(result);
-}
-
-/* Wait until all callbacks for index operations are complete. */
-int uds_flush_index_session(struct uds_index_session *index_session)
-{
-       wait_for_no_requests_in_progress(index_session);
-       uds_wait_for_idle_index(index_session->index);
-       return UDS_SUCCESS;
-}
-
-/* Statistics collection is intended to be thread-safe. */
-static void collect_stats(const struct uds_index_session *index_session,
-                         struct uds_index_stats *stats)
-{
-       const struct session_stats *session_stats = &index_session->stats;
-
-       stats->current_time = ktime_to_seconds(current_time_ns(CLOCK_REALTIME));
-       stats->posts_found = READ_ONCE(session_stats->posts_found);
-       stats->in_memory_posts_found = READ_ONCE(session_stats->posts_found_open_chapter);
-       stats->dense_posts_found = READ_ONCE(session_stats->posts_found_dense);
-       stats->sparse_posts_found = READ_ONCE(session_stats->posts_found_sparse);
-       stats->posts_not_found = READ_ONCE(session_stats->posts_not_found);
-       stats->updates_found = READ_ONCE(session_stats->updates_found);
-       stats->updates_not_found = READ_ONCE(session_stats->updates_not_found);
-       stats->deletions_found = READ_ONCE(session_stats->deletions_found);
-       stats->deletions_not_found = READ_ONCE(session_stats->deletions_not_found);
-       stats->queries_found = READ_ONCE(session_stats->queries_found);
-       stats->queries_not_found = READ_ONCE(session_stats->queries_not_found);
-       stats->requests = READ_ONCE(session_stats->requests);
-}
-
-int uds_get_index_session_stats(struct uds_index_session *index_session,
-                               struct uds_index_stats *stats)
-{
-       if (stats == NULL) {
-               uds_log_error("received a NULL index stats pointer");
-               return -EINVAL;
-       }
-
-       collect_stats(index_session, stats);
-       if (index_session->index != NULL) {
-               uds_get_index_stats(index_session->index, stats);
-       } else {
-               stats->entries_indexed = 0;
-               stats->memory_used = 0;
-               stats->collisions = 0;
-               stats->entries_discarded = 0;
-       }
-
-       return UDS_SUCCESS;
-}
-
-void uds_wait_cond(struct cond_var *cv, struct mutex *mutex)
-{
-       DEFINE_WAIT(__wait);
-
-       prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE);
-       mutex_unlock(mutex);
-       schedule();
-       finish_wait(&cv->wait_queue, &__wait);
-       mutex_lock(mutex);
-}
diff --git a/drivers/md/dm-vdo/index-session.h b/drivers/md/dm-vdo/index-session.h
deleted file mode 100644 (file)
index 733d10f..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_SESSION_H
-#define UDS_INDEX_SESSION_H
-
-#include <linux/atomic.h>
-#include <linux/cache.h>
-
-#include "config.h"
-#include "indexer.h"
-#include "thread-utils.h"
-
-/*
- * The index session mediates all interactions with a UDS index. Once the index session is created,
- * it can be used to open, close, suspend, or recreate an index. It implements the majority of the
- * functions in the top-level UDS API.
- *
- * If any deduplication request fails due to an internal error, the index is marked disabled. It
- * will not accept any further requests and can only be closed. Closing the index will clear the
- * disabled flag, and the index can then be reopened and recovered using the same index session.
- */
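A hedged sketch of the lifecycle this session mediates, built only from the public entry points defined in index-session.c above; the kernel context is assumed, the sketch function name and the new_bdev parameter are invented, params->bdev must already be set, and error handling is abbreviated:

/* Sketch only: create a session, open (or create) an index under it, suspend
 * and resume around a backing-device change, then close and destroy it. */
static int session_lifecycle_sketch(struct uds_parameters *params,
				    struct block_device *new_bdev)
{
	struct uds_index_session *session;
	int result;

	result = uds_create_index_session(&session);
	if (result != 0)
		return result;

	result = uds_open_index(UDS_CREATE, params, session);
	if (result != 0)
		goto out;

	/* Suspending with save set to true writes out index state first. */
	result = uds_suspend_index_session(session, true);
	if (result != 0)
		goto out;

	/* Resuming with a different bdev switches the backing store. */
	result = uds_resume_index_session(session, new_bdev);
	if (result != 0)
		goto out;

	result = uds_close_index(session);
out:
	uds_destroy_index_session(session);
	return result;
}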
-
-struct __aligned(L1_CACHE_BYTES) session_stats {
-       /* Post requests that found an entry */
-       u64 posts_found;
-       /* Post requests found in the open chapter */
-       u64 posts_found_open_chapter;
-       /* Post requests found in the dense index */
-       u64 posts_found_dense;
-       /* Post requests found in the sparse index */
-       u64 posts_found_sparse;
-       /* Post requests that did not find an entry */
-       u64 posts_not_found;
-       /* Update requests that found an entry */
-       u64 updates_found;
-       /* Update requests that did not find an entry */
-       u64 updates_not_found;
-       /* Delete requests that found an entry */
-       u64 deletions_found;
-       /* Delete requests that did not find an entry */
-       u64 deletions_not_found;
-       /* Query requests that found an entry */
-       u64 queries_found;
-       /* Query requests that did not find an entry */
-       u64 queries_not_found;
-       /* Total number of requests */
-       u64 requests;
-};
-
-enum index_suspend_status {
-       /* An index load has started but the index is not ready for use. */
-       INDEX_OPENING = 0,
-       /* The index is able to handle requests. */
-       INDEX_READY,
-       /* The index is attempting to suspend a rebuild. */
-       INDEX_SUSPENDING,
-       /* An index rebuild has been suspended. */
-       INDEX_SUSPENDED,
-       /* An index rebuild is being stopped in order to shut down. */
-       INDEX_FREEING,
-};
-
-struct index_load_context {
-       struct mutex mutex;
-       struct cond_var cond;
-       enum index_suspend_status status;
-};
-
-struct uds_index_session {
-       unsigned int state;
-       struct uds_index *index;
-       struct uds_request_queue *callback_queue;
-       struct uds_parameters parameters;
-       struct index_load_context load_context;
-       struct mutex request_mutex;
-       struct cond_var request_cond;
-       int request_count;
-       struct session_stats stats;
-};
-
-#endif /* UDS_INDEX_SESSION_H */
diff --git a/drivers/md/dm-vdo/index.c b/drivers/md/dm-vdo/index.c
deleted file mode 100644 (file)
index 9d4a8e5..0000000
+++ /dev/null
@@ -1,1387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "index.h"
-
-#include "funnel-requestqueue.h"
-#include "hash-utils.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "sparse-cache.h"
-
-static const u64 NO_LAST_SAVE = U64_MAX;
-
-/*
- * When searching for deduplication records, the index first searches the volume index, and then
- * searches the chapter index for the relevant chapter. If the chapter has been fully committed to
- * storage, the chapter pages are loaded into the page cache. If the chapter has not yet been
- * committed (either the open chapter or a recently closed one), the index searches the in-memory
- * representation of the chapter. Finally, if the volume index does not find a record and the index
- * is sparse, the index will search the sparse cache.
- *
- * The index sends two kinds of messages to coordinate between zones: chapter close messages for the
- * chapter writer, and sparse cache barrier messages for the sparse cache.
- *
- * The chapter writer is responsible for committing chapters of records to storage. Since zones can
- * get different numbers of records, some zones may fall behind others. Each time a zone fills up
- * its available space in a chapter, it informs the chapter writer that the chapter is complete,
- * and also informs all other zones that it has closed the chapter. Each other zone will then close
- * the chapter immediately, regardless of how full it is, in order to minimize skew between zones.
- * Once every zone has closed the chapter, the chapter writer will commit that chapter to storage.
- *
- * The last zone to close the chapter also removes the oldest chapter from the volume index.
- * Although that chapter is invalid for zones that have moved on, the existence of the open chapter
- * means that those zones will never ask the volume index about it. No zone is allowed to get more
- * than one chapter ahead of any other. If a zone is so far ahead that it tries to close another
- * chapter before the previous one has been closed by all zones, it is forced to wait.
- *
- * The sparse cache relies on having the same set of chapter indexes available to all zones. When a
- * request wants to add a chapter to the sparse cache, it sends a barrier message to each zone
- * during the triage stage that acts as a rendezvous. Once every zone has reached the barrier and
- * paused its operations, the cache membership is changed and each zone is then informed that it
- * can proceed. More details can be found in the sparse cache documentation.
- *
- * If a sparse index has only one zone, it will not create a triage queue, but it still needs the
- * barrier message to change the sparse cache membership, so the index simulates the message by
- * invoking the handler directly.
- */
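The zone coordination described above boils down to a counting barrier: each zone checks in its closed chapter under a lock, and the writer commits once every zone has checked in. A simplified userspace model follows (pthreads, invented names, initialization omitted); it mirrors the pattern of the chapter_writer below without being that code:

#include <pthread.h>

struct writer_model {
	pthread_mutex_t mutex;
	pthread_cond_t cond;
	unsigned int zone_count;
	unsigned int zones_to_write; /* zones that have closed the current chapter */
};

/* Called by each zone when it closes its open chapter; returns how many zones
 * have checked in so far, as start_closing_chapter() does below. */
static unsigned int model_close_chapter(struct writer_model *w)
{
	unsigned int finished;

	pthread_mutex_lock(&w->mutex);
	finished = ++w->zones_to_write;
	pthread_cond_broadcast(&w->cond);
	pthread_mutex_unlock(&w->mutex);
	return finished;
}

/* The writer thread sleeps until every zone has checked in, then resets the
 * count before committing the closed chapter to storage. */
static void model_writer_wait_and_commit(struct writer_model *w)
{
	pthread_mutex_lock(&w->mutex);
	while (w->zones_to_write < w->zone_count)
		pthread_cond_wait(&w->cond, &w->mutex);
	w->zones_to_write = 0;
	pthread_mutex_unlock(&w->mutex);
	/* ...commit the closed chapter here... */
}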
-
-struct chapter_writer {
-       /* The index to which we belong */
-       struct uds_index *index;
-       /* The thread to do the writing */
-       struct thread *thread;
-       /* The lock protecting the following fields */
-       struct mutex mutex;
-       /* The condition signalled on state changes */
-       struct cond_var cond;
-       /* Set to true to stop the thread */
-       bool stop;
-       /* The result from the most recent write */
-       int result;
-       /* The number of bytes allocated by the chapter writer */
-       size_t memory_size;
-       /* The number of zones which have submitted a chapter for writing */
-       unsigned int zones_to_write;
-       /* Open chapter index used by uds_close_open_chapter() */
-       struct open_chapter_index *open_chapter_index;
-       /* Collated records used by uds_close_open_chapter() */
-       struct uds_volume_record *collated_records;
-       /* The chapters to write (one per zone) */
-       struct open_chapter_zone *chapters[];
-};
-
-static bool is_zone_chapter_sparse(const struct index_zone *zone, u64 virtual_chapter)
-{
-       return uds_is_chapter_sparse(zone->index->volume->geometry,
-                                    zone->oldest_virtual_chapter,
-                                    zone->newest_virtual_chapter, virtual_chapter);
-}
-
-static int launch_zone_message(struct uds_zone_message message, unsigned int zone,
-                              struct uds_index *index)
-{
-       int result;
-       struct uds_request *request;
-
-       result = uds_allocate(1, struct uds_request, __func__, &request);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       request->index = index;
-       request->unbatched = true;
-       request->zone_number = zone;
-       request->zone_message = message;
-
-       uds_enqueue_request(request, STAGE_MESSAGE);
-       return UDS_SUCCESS;
-}
-
-static void enqueue_barrier_messages(struct uds_index *index, u64 virtual_chapter)
-{
-       struct uds_zone_message message = {
-               .type = UDS_MESSAGE_SPARSE_CACHE_BARRIER,
-               .virtual_chapter = virtual_chapter,
-       };
-       unsigned int zone;
-
-       for (zone = 0; zone < index->zone_count; zone++) {
-               int result = launch_zone_message(message, zone, index);
-
-               ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation");
-       }
-}
-
-/*
- * Determine whether this request should trigger a sparse cache barrier message to change the
- * membership of the sparse cache. If a change in membership is desired, the function returns the
- * chapter number to add.
- */
-static u64 triage_index_request(struct uds_index *index, struct uds_request *request)
-{
-       u64 virtual_chapter;
-       struct index_zone *zone;
-
-       virtual_chapter = uds_lookup_volume_index_name(index->volume_index,
-                                                      &request->record_name);
-       if (virtual_chapter == NO_CHAPTER)
-               return NO_CHAPTER;
-
-       zone = index->zones[request->zone_number];
-       if (!is_zone_chapter_sparse(zone, virtual_chapter))
-               return NO_CHAPTER;
-
-       /*
-        * FIXME: Optimize for a common case by remembering the chapter from the most recent
- *      barrier message and skipping this chapter if it is the same.
-        */
-
-       return virtual_chapter;
-}
-
-/*
- * Simulate a message to change the sparse cache membership for a single-zone sparse index. This
- * allows us to forgo the complicated locking required by a multi-zone sparse index. Any other kind
- * of index does nothing here.
- */
-static int simulate_index_zone_barrier_message(struct index_zone *zone,
-                                              struct uds_request *request)
-{
-       u64 sparse_virtual_chapter;
-
-       if ((zone->index->zone_count > 1) ||
-           !uds_is_sparse_index_geometry(zone->index->volume->geometry))
-               return UDS_SUCCESS;
-
-       sparse_virtual_chapter = triage_index_request(zone->index, request);
-       if (sparse_virtual_chapter == NO_CHAPTER)
-               return UDS_SUCCESS;
-
-       return uds_update_sparse_cache(zone, sparse_virtual_chapter);
-}
-
-/* This is the request processing function for the triage queue. */
-static void triage_request(struct uds_request *request)
-{
-       struct uds_index *index = request->index;
-       u64 sparse_virtual_chapter = triage_index_request(index, request);
-
-       if (sparse_virtual_chapter != NO_CHAPTER)
-               enqueue_barrier_messages(index, sparse_virtual_chapter);
-
-       uds_enqueue_request(request, STAGE_INDEX);
-}
-
-static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_number)
-{
-       int result;
-       struct chapter_writer *writer = index->chapter_writer;
-
-       mutex_lock(&writer->mutex);
-       while (index->newest_virtual_chapter < current_chapter_number)
-               uds_wait_cond(&writer->cond, &writer->mutex);
-       result = writer->result;
-       mutex_unlock(&writer->mutex);
-
-       if (result != UDS_SUCCESS)
-               return uds_log_error_strerror(result,
-                                             "Writing of previous open chapter failed");
-
-       return UDS_SUCCESS;
-}
-
-static int swap_open_chapter(struct index_zone *zone)
-{
-       int result;
-       struct open_chapter_zone *temporary_chapter;
-
-       result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       temporary_chapter = zone->open_chapter;
-       zone->open_chapter = zone->writing_chapter;
-       zone->writing_chapter = temporary_chapter;
-       return UDS_SUCCESS;
-}
-
-/*
- * Inform the chapter writer that this zone is done with this chapter. The chapter will not be
- * written until all zones have closed it.
- */
-static unsigned int start_closing_chapter(struct uds_index *index,
-                                         unsigned int zone_number,
-                                         struct open_chapter_zone *chapter)
-{
-       unsigned int finished_zones;
-       struct chapter_writer *writer = index->chapter_writer;
-
-       mutex_lock(&writer->mutex);
-       finished_zones = ++writer->zones_to_write;
-       writer->chapters[zone_number] = chapter;
-       uds_broadcast_cond(&writer->cond);
-       mutex_unlock(&writer->mutex);
-
-       return finished_zones;
-}
-
-static int announce_chapter_closed(struct index_zone *zone, u64 closed_chapter)
-{
-       int result;
-       unsigned int i;
-       struct uds_zone_message zone_message = {
-               .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
-               .virtual_chapter = closed_chapter,
-       };
-
-       for (i = 0; i < zone->index->zone_count; i++) {
-               if (zone->id == i)
-                       continue;
-
-               result = launch_zone_message(zone_message, i, zone->index);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int open_next_chapter(struct index_zone *zone)
-{
-       int result;
-       u64 closed_chapter;
-       u64 expiring;
-       unsigned int finished_zones;
-       u32 expire_chapters;
-
-       uds_log_debug("closing chapter %llu of zone %u after %u entries (%u short)",
-                     (unsigned long long) zone->newest_virtual_chapter, zone->id,
-                     zone->open_chapter->size,
-                     zone->open_chapter->capacity - zone->open_chapter->size);
-
-       result = swap_open_chapter(zone);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       closed_chapter = zone->newest_virtual_chapter++;
-       uds_set_volume_index_zone_open_chapter(zone->index->volume_index, zone->id,
-                                              zone->newest_virtual_chapter);
-       uds_reset_open_chapter(zone->open_chapter);
-
-       finished_zones = start_closing_chapter(zone->index, zone->id,
-                                              zone->writing_chapter);
-       if ((finished_zones == 1) && (zone->index->zone_count > 1)) {
-               result = announce_chapter_closed(zone, closed_chapter);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       expiring = zone->oldest_virtual_chapter;
-       expire_chapters = uds_chapters_to_expire(zone->index->volume->geometry,
-                                                zone->newest_virtual_chapter);
-       zone->oldest_virtual_chapter += expire_chapters;
-
-       if (finished_zones < zone->index->zone_count)
-               return UDS_SUCCESS;
-
-       while (expire_chapters-- > 0)
-               uds_forget_chapter(zone->index->volume, expiring++);
-
-       return UDS_SUCCESS;
-}
-
-static int handle_chapter_closed(struct index_zone *zone, u64 virtual_chapter)
-{
-       if (zone->newest_virtual_chapter == virtual_chapter)
-               return open_next_chapter(zone);
-
-       return UDS_SUCCESS;
-}
-
-static int dispatch_index_zone_control_request(struct uds_request *request)
-{
-       struct uds_zone_message *message = &request->zone_message;
-       struct index_zone *zone = request->index->zones[request->zone_number];
-
-       switch (message->type) {
-       case UDS_MESSAGE_SPARSE_CACHE_BARRIER:
-               return uds_update_sparse_cache(zone, message->virtual_chapter);
-
-       case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED:
-               return handle_chapter_closed(zone, message->virtual_chapter);
-
-       default:
-               uds_log_error("invalid message type: %d", message->type);
-               return UDS_INVALID_ARGUMENT;
-       }
-}
-
-static void set_request_location(struct uds_request *request,
-                                enum uds_index_region new_location)
-{
-       request->location = new_location;
-       request->found = ((new_location == UDS_LOCATION_IN_OPEN_CHAPTER) ||
-                         (new_location == UDS_LOCATION_IN_DENSE) ||
-                         (new_location == UDS_LOCATION_IN_SPARSE));
-}
-
-static void set_chapter_location(struct uds_request *request,
-                                const struct index_zone *zone, u64 virtual_chapter)
-{
-       request->found = true;
-       if (virtual_chapter == zone->newest_virtual_chapter)
-               request->location = UDS_LOCATION_IN_OPEN_CHAPTER;
-       else if (is_zone_chapter_sparse(zone, virtual_chapter))
-               request->location = UDS_LOCATION_IN_SPARSE;
-       else
-               request->location = UDS_LOCATION_IN_DENSE;
-}
-
-static int search_sparse_cache_in_zone(struct index_zone *zone, struct uds_request *request,
-                                      u64 virtual_chapter, bool *found)
-{
-       int result;
-       struct volume *volume;
-       u16 record_page_number;
-       u32 chapter;
-
-       result = uds_search_sparse_cache(zone, &request->record_name, &virtual_chapter,
-                                        &record_page_number);
-       if ((result != UDS_SUCCESS) || (virtual_chapter == NO_CHAPTER))
-               return result;
-
-       request->virtual_chapter = virtual_chapter;
-       volume = zone->index->volume;
-       chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
-       return uds_search_cached_record_page(volume, request, chapter,
-                                            record_page_number, found);
-}
-
-static int get_record_from_zone(struct index_zone *zone, struct uds_request *request,
-                               bool *found)
-{
-       struct volume *volume;
-
-       if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) {
-               *found = true;
-               return UDS_SUCCESS;
-       } else if (request->location == UDS_LOCATION_UNAVAILABLE) {
-               *found = false;
-               return UDS_SUCCESS;
-       }
-
-       if (request->virtual_chapter == zone->newest_virtual_chapter) {
-               uds_search_open_chapter(zone->open_chapter, &request->record_name,
-                                       &request->old_metadata, found);
-               return UDS_SUCCESS;
-       }
-
-       if ((zone->newest_virtual_chapter > 0) &&
-           (request->virtual_chapter == (zone->newest_virtual_chapter - 1)) &&
-           (zone->writing_chapter->size > 0)) {
-               uds_search_open_chapter(zone->writing_chapter, &request->record_name,
-                                       &request->old_metadata, found);
-               return UDS_SUCCESS;
-       }
-
-       volume = zone->index->volume;
-       if (is_zone_chapter_sparse(zone, request->virtual_chapter) &&
-           uds_sparse_cache_contains(volume->sparse_cache, request->virtual_chapter,
-                                     request->zone_number))
-               return search_sparse_cache_in_zone(zone, request,
-                                                  request->virtual_chapter, found);
-
-       return uds_search_volume_page_cache(volume, request, found);
-}
-
-static int put_record_in_zone(struct index_zone *zone, struct uds_request *request,
-                             const struct uds_record_data *metadata)
-{
-       unsigned int remaining;
-
-       remaining = uds_put_open_chapter(zone->open_chapter, &request->record_name,
-                                        metadata);
-       if (remaining == 0)
-               return open_next_chapter(zone);
-
-       return UDS_SUCCESS;
-}
-
-static int search_index_zone(struct index_zone *zone, struct uds_request *request)
-{
-       int result;
-       struct volume_index_record record;
-       bool overflow_record, found = false;
-       struct uds_record_data *metadata;
-       u64 chapter;
-
-       result = uds_get_volume_index_record(zone->index->volume_index,
-                                            &request->record_name, &record);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (record.is_found) {
-               if (request->requeued && request->virtual_chapter != record.virtual_chapter)
-                       set_request_location(request, UDS_LOCATION_UNKNOWN);
-
-               request->virtual_chapter = record.virtual_chapter;
-               result = get_record_from_zone(zone, request, &found);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       if (found)
-               set_chapter_location(request, zone, record.virtual_chapter);
-
-       /*
-        * If a record has overflowed a chapter index in more than one chapter (or overflowed in
-        * one chapter and collided with an existing record), it will exist as a collision record
-        * in the volume index, but we won't find it in the volume. This case needs special
-        * handling.
-        */
-       overflow_record = (record.is_found && record.is_collision && !found);
-       chapter = zone->newest_virtual_chapter;
-       if (found || overflow_record) {
-               if ((request->type == UDS_QUERY_NO_UPDATE) ||
-                   ((request->type == UDS_QUERY) && overflow_record)) {
-                       /* There is nothing left to do. */
-                       return UDS_SUCCESS;
-               }
-
-               if (record.virtual_chapter != chapter) {
-                       /*
-                        * Update the volume index to reference the new chapter for the block. If
-                        * the record had been deleted or dropped from the chapter index, it will
-                        * be back.
-                        */
-                       result = uds_set_volume_index_record_chapter(&record, chapter);
-               } else if (request->type != UDS_UPDATE) {
-                       /* The record is already in the open chapter. */
-                       return UDS_SUCCESS;
-               }
-       } else {
-               /*
-                * The record wasn't in the volume index, so check whether the
-                * name is in a cached sparse chapter. If we found the name on
-                * a previous search, use that result instead.
-                */
-               if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) {
-                       found = true;
-               } else if (request->location == UDS_LOCATION_UNAVAILABLE) {
-                       found = false;
-               } else if (uds_is_sparse_index_geometry(zone->index->volume->geometry) &&
-                          !uds_is_volume_index_sample(zone->index->volume_index,
-                                                      &request->record_name)) {
-                       result = search_sparse_cache_in_zone(zone, request, NO_CHAPTER,
-                                                            &found);
-                       if (result != UDS_SUCCESS)
-                               return result;
-               }
-
-               if (found)
-                       set_request_location(request, UDS_LOCATION_IN_SPARSE);
-
-               if ((request->type == UDS_QUERY_NO_UPDATE) ||
-                   ((request->type == UDS_QUERY) && !found)) {
-                       /* There is nothing left to do. */
-                       return UDS_SUCCESS;
-               }
-
-               /*
-                * Add a new entry to the volume index referencing the open chapter. This needs to
-                * be done both for new records, and for records from cached sparse chapters.
-                */
-               result = uds_put_volume_index_record(&record, chapter);
-       }
-
-       if (result == UDS_OVERFLOW) {
-               /*
-                * The volume index encountered a delta list overflow. The condition was already
-                * logged. We will go on without adding the record to the open chapter.
-                */
-               return UDS_SUCCESS;
-       }
-
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (!found || (request->type == UDS_UPDATE)) {
-               /* This is a new record or we're updating an existing record. */
-               metadata = &request->new_metadata;
-       } else {
-               /* Move the existing record to the open chapter. */
-               metadata = &request->old_metadata;
-       }
-
-       return put_record_in_zone(zone, request, metadata);
-}
-
-static int remove_from_index_zone(struct index_zone *zone, struct uds_request *request)
-{
-       int result;
-       struct volume_index_record record;
-
-       result = uds_get_volume_index_record(zone->index->volume_index,
-                                            &request->record_name, &record);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (!record.is_found)
-               return UDS_SUCCESS;
-
-       /* If the request was requeued, check whether the saved state is still valid. */
-
-       if (record.is_collision) {
-               set_chapter_location(request, zone, record.virtual_chapter);
-       } else {
-               /* Non-collision records are hints, so resolve the name in the chapter. */
-               bool found;
-
-               if (request->requeued && request->virtual_chapter != record.virtual_chapter)
-                       set_request_location(request, UDS_LOCATION_UNKNOWN);
-
-               request->virtual_chapter = record.virtual_chapter;
-               result = get_record_from_zone(zone, request, &found);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               if (!found) {
-                       /* There is no record to remove. */
-                       return UDS_SUCCESS;
-               }
-       }
-
-       set_chapter_location(request, zone, record.virtual_chapter);
-
-       /*
-        * Delete the volume index entry for the named record only. Note that a later search might
-        * return stale advice if there is a colliding name in the same chapter, but it's a
-        * very rare case (1 in 2^21).
-        */
-       result = uds_remove_volume_index_record(&record);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       /*
-        * If the record is in the open chapter, we must remove it or mark it deleted to avoid
-        * trouble if the record is added again later.
-        */
-       if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
-               uds_remove_from_open_chapter(zone->open_chapter, &request->record_name);
-
-       return UDS_SUCCESS;
-}
-
-static int dispatch_index_request(struct uds_index *index, struct uds_request *request)
-{
-       int result;
-       struct index_zone *zone = index->zones[request->zone_number];
-
-       if (!request->requeued) {
-               result = simulate_index_zone_barrier_message(zone, request);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       switch (request->type) {
-       case UDS_POST:
-       case UDS_UPDATE:
-       case UDS_QUERY:
-       case UDS_QUERY_NO_UPDATE:
-               result = search_index_zone(zone, request);
-               break;
-
-       case UDS_DELETE:
-               result = remove_from_index_zone(zone, request);
-               break;
-
-       default:
-               result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
-                                                 "invalid request type: %d",
-                                                 request->type);
-               break;
-       }
-
-       return result;
-}
-
-/* This is the request processing function invoked by each zone's thread. */
-static void execute_zone_request(struct uds_request *request)
-{
-       int result;
-       struct uds_index *index = request->index;
-
-       if (request->zone_message.type != UDS_MESSAGE_NONE) {
-               result = dispatch_index_zone_control_request(request);
-               if (result != UDS_SUCCESS) {
-                       uds_log_error_strerror(result, "error executing message: %d",
-                                              request->zone_message.type);
-               }
-
-               /* Once the message is processed it can be freed. */
-               uds_free(uds_forget(request));
-               return;
-       }
-
-       index->need_to_save = true;
-       if (request->requeued && (request->status != UDS_SUCCESS)) {
-               set_request_location(request, UDS_LOCATION_UNAVAILABLE);
-               index->callback(request);
-               return;
-       }
-
-       result = dispatch_index_request(index, request);
-       if (result == UDS_QUEUED) {
-               /* The request has been requeued so don't let it complete. */
-               return;
-       }
-
-       if (!request->found)
-               set_request_location(request, UDS_LOCATION_UNAVAILABLE);
-
-       request->status = result;
-       index->callback(request);
-}
-
-static int initialize_index_queues(struct uds_index *index,
-                                  const struct index_geometry *geometry)
-{
-       int result;
-       unsigned int i;
-
-       for (i = 0; i < index->zone_count; i++) {
-               result = uds_make_request_queue("indexW", &execute_zone_request,
-                                               &index->zone_queues[i]);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       /* The triage queue is only needed for sparse multi-zone indexes. */
-       if ((index->zone_count > 1) && uds_is_sparse_index_geometry(geometry)) {
-               result = uds_make_request_queue("triageW", &triage_request,
-                                               &index->triage_queue);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       return UDS_SUCCESS;
-}
-
-/* This is the driver function for the chapter writer thread. */
-static void close_chapters(void *arg)
-{
-       int result;
-       struct chapter_writer *writer = arg;
-       struct uds_index *index = writer->index;
-
-       uds_log_debug("chapter writer starting");
-       mutex_lock(&writer->mutex);
-       for (;;) {
-               while (writer->zones_to_write < index->zone_count) {
-                       if (writer->stop && (writer->zones_to_write == 0)) {
-                               /*
-                                * We've been told to stop, and all of the zones are in the same
-                                * open chapter, so we can exit now.
-                                */
-                               mutex_unlock(&writer->mutex);
-                               uds_log_debug("chapter writer stopping");
-                               return;
-                       }
-                       uds_wait_cond(&writer->cond, &writer->mutex);
-               }
-
-               /*
-                * Release the lock while closing a chapter. We probably don't need to do this, but
-                * it seems safer in principle. It's OK to access the chapter and chapter_number
-                * fields without the lock since those aren't allowed to change until we're done.
-                */
-               mutex_unlock(&writer->mutex);
-
-               if (index->has_saved_open_chapter) {
-                       /*
-                        * Remove the saved open chapter the first time we close an open chapter
-                        * after loading from a clean shutdown, or after doing a clean save. The
-                        * lack of the saved open chapter will indicate that a recovery is
-                        * necessary.
-                        */
-                       index->has_saved_open_chapter = false;
-                       result = uds_discard_open_chapter(index->layout);
-                       if (result == UDS_SUCCESS)
-                               uds_log_debug("Discarding saved open chapter");
-               }
-
-               result = uds_close_open_chapter(writer->chapters, index->zone_count,
-                                               index->volume,
-                                               writer->open_chapter_index,
-                                               writer->collated_records,
-                                               index->newest_virtual_chapter);
-
-               mutex_lock(&writer->mutex);
-               index->newest_virtual_chapter++;
-               index->oldest_virtual_chapter +=
-                       uds_chapters_to_expire(index->volume->geometry,
-                                              index->newest_virtual_chapter);
-               writer->result = result;
-               writer->zones_to_write = 0;
-               uds_broadcast_cond(&writer->cond);
-       }
-}
-
-static void stop_chapter_writer(struct chapter_writer *writer)
-{
-       struct thread *writer_thread = NULL;
-
-       mutex_lock(&writer->mutex);
-       if (writer->thread != NULL) {
-               writer_thread = writer->thread;
-               writer->thread = NULL;
-               writer->stop = true;
-               uds_broadcast_cond(&writer->cond);
-       }
-       mutex_unlock(&writer->mutex);
-
-       if (writer_thread != NULL)
-               vdo_join_threads(writer_thread);
-}
-
-static void free_chapter_writer(struct chapter_writer *writer)
-{
-       if (writer == NULL)
-               return;
-
-       stop_chapter_writer(writer);
-       uds_free_open_chapter_index(writer->open_chapter_index);
-       uds_free(writer->collated_records);
-       uds_free(writer);
-}
-
-static int make_chapter_writer(struct uds_index *index,
-                              struct chapter_writer **writer_ptr)
-{
-       int result;
-       struct chapter_writer *writer;
-       size_t collated_records_size =
-               (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter);
-
-       result = uds_allocate_extended(struct chapter_writer, index->zone_count,
-                                      struct open_chapter_zone *, "Chapter Writer",
-                                      &writer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       writer->index = index;
-       mutex_init(&writer->mutex);
-       uds_init_cond(&writer->cond);
-
-       result = uds_allocate_cache_aligned(collated_records_size, "collated records",
-                                           &writer->collated_records);
-       if (result != UDS_SUCCESS) {
-               free_chapter_writer(writer);
-               return result;
-       }
-
-       result = uds_make_open_chapter_index(&writer->open_chapter_index,
-                                            index->volume->geometry,
-                                            index->volume->nonce);
-       if (result != UDS_SUCCESS) {
-               free_chapter_writer(writer);
-               return result;
-       }
-
-       writer->memory_size = (sizeof(struct chapter_writer) +
-                              index->zone_count * sizeof(struct open_chapter_zone *) +
-                              collated_records_size +
-                              writer->open_chapter_index->memory_size);
-
-       result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread);
-       if (result != UDS_SUCCESS) {
-               free_chapter_writer(writer);
-               return result;
-       }
-
-       *writer_ptr = writer;
-       return UDS_SUCCESS;
-}
-
-static int load_index(struct uds_index *index)
-{
-       int result;
-       u64 last_save_chapter;
-
-       result = uds_load_index_state(index->layout, index);
-       if (result != UDS_SUCCESS)
-               return UDS_INDEX_NOT_SAVED_CLEANLY;
-
-       last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0);
-
-       uds_log_info("loaded index from chapter %llu through chapter %llu",
-                    (unsigned long long) index->oldest_virtual_chapter,
-                    (unsigned long long) last_save_chapter);
-
-       return UDS_SUCCESS;
-}
-
-static int rebuild_index_page_map(struct uds_index *index, u64 vcn)
-{
-       int result;
-       struct delta_index_page *chapter_index_page;
-       struct index_geometry *geometry = index->volume->geometry;
-       u32 chapter = uds_map_to_physical_chapter(geometry, vcn);
-       u32 expected_list_number = 0;
-       u32 index_page_number;
-       u32 lowest_delta_list;
-       u32 highest_delta_list;
-
-       for (index_page_number = 0;
-            index_page_number < geometry->index_pages_per_chapter;
-            index_page_number++) {
-               result = uds_get_volume_index_page(index->volume, chapter,
-                                                  index_page_number,
-                                                  &chapter_index_page);
-               if (result != UDS_SUCCESS) {
-                       return uds_log_error_strerror(result,
-                                                     "failed to read index page %u in chapter %u",
-                                                     index_page_number, chapter);
-               }
-
-               lowest_delta_list = chapter_index_page->lowest_list_number;
-               highest_delta_list = chapter_index_page->highest_list_number;
-               if (lowest_delta_list != expected_list_number) {
-                       return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                                     "chapter %u index page %u is corrupt",
-                                                     chapter, index_page_number);
-               }
-
-               uds_update_index_page_map(index->volume->index_page_map, vcn, chapter,
-                                         index_page_number, highest_delta_list);
-               expected_list_number = highest_delta_list + 1;
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int replay_record(struct uds_index *index, const struct uds_record_name *name,
-                        u64 virtual_chapter, bool will_be_sparse_chapter)
-{
-       int result;
-       struct volume_index_record record;
-       bool update_record;
-
-       if (will_be_sparse_chapter &&
-           !uds_is_volume_index_sample(index->volume_index, name)) {
-               /*
-                * This entry will be in a sparse chapter after the rebuild completes, and it is
-                * not a sample, so just skip over it.
-                */
-               return UDS_SUCCESS;
-       }
-
-       result = uds_get_volume_index_record(index->volume_index, name, &record);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (record.is_found) {
-               if (record.is_collision) {
-                       if (record.virtual_chapter == virtual_chapter) {
-                               /* The record is already correct. */
-                               return UDS_SUCCESS;
-                       }
-
-                       update_record = true;
-               } else if (record.virtual_chapter == virtual_chapter) {
-                       /*
-                        * There is a volume index entry pointing to the current chapter, but we
-                        * don't know if it is for the same name as the one we are currently
-                        * working on or not. For now, we're just going to assume that it isn't.
-                        * This will create one extra collision record if there was a deleted
-                        * record in the current chapter.
-                        */
-                       update_record = false;
-               } else {
-                       /*
-                        * If we're rebuilding, we don't normally want to go to disk to see if the
-                        * record exists, since we will likely have just read the record from disk
-                        * (i.e. we know it's there). The exception to this is when we find an
-                        * entry in the volume index that has a different chapter. In this case, we
-                        * need to search that chapter to determine if the volume index entry was
-                        * for the same record or a different one.
-                        */
-                       result = uds_search_volume_page_cache_for_rebuild(index->volume,
-                                                                         name,
-                                                                         record.virtual_chapter,
-                                                                         &update_record);
-                       if (result != UDS_SUCCESS)
-                               return result;
-               }
-       } else {
-               update_record = false;
-       }
-
-       if (update_record) {
-               /*
-                * Update the volume index to reference the new chapter for the block. If the
-                * record had been deleted or dropped from the chapter index, it will be back.
-                */
-               result = uds_set_volume_index_record_chapter(&record, virtual_chapter);
-       } else {
-               /*
-                * Add a new entry to the volume index referencing the open chapter. This should be
-                * done regardless of whether this is a brand new record or a sparse record, i.e.
-                * one that doesn't exist in the index but does on disk, since for a sparse record,
-                * we would want to un-sparsify if it did exist.
-                */
-               result = uds_put_volume_index_record(&record, virtual_chapter);
-       }
-
-       if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) {
-               /* The rebuilt index will lose these records. */
-               return UDS_SUCCESS;
-       }
-
-       return result;
-}
-
-static bool check_for_suspend(struct uds_index *index)
-{
-       bool closing;
-
-       if (index->load_context == NULL)
-               return false;
-
-       mutex_lock(&index->load_context->mutex);
-       if (index->load_context->status != INDEX_SUSPENDING) {
-               mutex_unlock(&index->load_context->mutex);
-               return false;
-       }
-
-       /* Notify that we are suspended and wait for the resume. */
-       index->load_context->status = INDEX_SUSPENDED;
-       uds_broadcast_cond(&index->load_context->cond);
-
-       while ((index->load_context->status != INDEX_OPENING) &&
-              (index->load_context->status != INDEX_FREEING))
-               uds_wait_cond(&index->load_context->cond, &index->load_context->mutex);
-
-       closing = (index->load_context->status == INDEX_FREEING);
-       mutex_unlock(&index->load_context->mutex);
-       return closing;
-}
-
-static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse)
-{
-       int result;
-       u32 i;
-       u32 j;
-       const struct index_geometry *geometry;
-       u32 physical_chapter;
-
-       if (check_for_suspend(index)) {
-               uds_log_info("Replay interrupted by index shutdown at chapter %llu",
-                            (unsigned long long) virtual);
-               return -EBUSY;
-       }
-
-       geometry = index->volume->geometry;
-       physical_chapter = uds_map_to_physical_chapter(geometry, virtual);
-       uds_prefetch_volume_chapter(index->volume, physical_chapter);
-       uds_set_volume_index_open_chapter(index->volume_index, virtual);
-
-       result = rebuild_index_page_map(index, virtual);
-       if (result != UDS_SUCCESS) {
-               return uds_log_error_strerror(result,
-                                             "could not rebuild index page map for chapter %u",
-                                             physical_chapter);
-       }
-
-       for (i = 0; i < geometry->record_pages_per_chapter; i++) {
-               u8 *record_page;
-               u32 record_page_number;
-
-               record_page_number = geometry->index_pages_per_chapter + i;
-               result = uds_get_volume_record_page(index->volume, physical_chapter,
-                                                   record_page_number, &record_page);
-               if (result != UDS_SUCCESS) {
-                       return uds_log_error_strerror(result, "could not get page %d",
-                                                     record_page_number);
-               }
-
-               for (j = 0; j < geometry->records_per_page; j++) {
-                       const u8 *name_bytes;
-                       struct uds_record_name name;
-
-                       name_bytes = record_page + (j * BYTES_PER_RECORD);
-                       memcpy(&name.name, name_bytes, UDS_RECORD_NAME_SIZE);
-                       result = replay_record(index, &name, virtual, sparse);
-                       if (result != UDS_SUCCESS)
-                               return result;
-               }
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int replay_volume(struct uds_index *index)
-{
-       int result;
-       u64 old_map_update;
-       u64 new_map_update;
-       u64 virtual;
-       u64 from_virtual = index->oldest_virtual_chapter;
-       u64 upto_virtual = index->newest_virtual_chapter;
-       bool will_be_sparse;
-
-       uds_log_info("Replaying volume from chapter %llu through chapter %llu",
-                    (unsigned long long) from_virtual,
-                    (unsigned long long) upto_virtual);
-
-       /*
-        * The index failed to load, so the volume index is empty. Add records to the volume index
-        * in order, skipping non-hooks in chapters which will be sparse to save time.
-        *
-        * Go through each record page of each chapter and add the records back to the volume
-        * index. This should not cause anything to be written to either the open chapter or the
-        * on-disk volume. Also skip the on-disk chapter corresponding to upto_virtual, as this
-        * would have already been purged from the volume index when the chapter was opened.
-        *
-        * Also, go through each index page for each chapter and rebuild the index page map.
-        */
-       old_map_update = index->volume->index_page_map->last_update;
-       for (virtual = from_virtual; virtual < upto_virtual; virtual++) {
-               will_be_sparse = uds_is_chapter_sparse(index->volume->geometry,
-                                                      from_virtual, upto_virtual,
-                                                      virtual);
-               result = replay_chapter(index, virtual, will_be_sparse);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       /* Also reap the chapter being replaced by the open chapter. */
-       uds_set_volume_index_open_chapter(index->volume_index, upto_virtual);
-
-       new_map_update = index->volume->index_page_map->last_update;
-       if (new_map_update != old_map_update) {
-               uds_log_info("replay changed index page map update from %llu to %llu",
-                            (unsigned long long) old_map_update,
-                            (unsigned long long) new_map_update);
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int rebuild_index(struct uds_index *index)
-{
-       int result;
-       u64 lowest;
-       u64 highest;
-       bool is_empty = false;
-       u32 chapters_per_volume = index->volume->geometry->chapters_per_volume;
-
-       index->volume->lookup_mode = LOOKUP_FOR_REBUILD;
-       result = uds_find_volume_chapter_boundaries(index->volume, &lowest, &highest,
-                                                   &is_empty);
-       if (result != UDS_SUCCESS) {
-               return uds_log_fatal_strerror(result,
-                                             "cannot rebuild index: unknown volume chapter boundaries");
-       }
-
-       if (is_empty) {
-               index->newest_virtual_chapter = 0;
-               index->oldest_virtual_chapter = 0;
-               index->volume->lookup_mode = LOOKUP_NORMAL;
-               return UDS_SUCCESS;
-       }
-
-       index->newest_virtual_chapter = highest + 1;
-       index->oldest_virtual_chapter = lowest;
-       if (index->newest_virtual_chapter ==
-           (index->oldest_virtual_chapter + chapters_per_volume)) {
-               /* Skip the chapter shadowed by the open chapter. */
-               index->oldest_virtual_chapter++;
-       }
-
-       result = replay_volume(index);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       index->volume->lookup_mode = LOOKUP_NORMAL;
-       return UDS_SUCCESS;
-}
-
-static void free_index_zone(struct index_zone *zone)
-{
-       if (zone == NULL)
-               return;
-
-       uds_free_open_chapter(zone->open_chapter);
-       uds_free_open_chapter(zone->writing_chapter);
-       uds_free(zone);
-}
-
-static int make_index_zone(struct uds_index *index, unsigned int zone_number)
-{
-       int result;
-       struct index_zone *zone;
-
-       result = uds_allocate(1, struct index_zone, "index zone", &zone);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_make_open_chapter(index->volume->geometry, index->zone_count,
-                                      &zone->open_chapter);
-       if (result != UDS_SUCCESS) {
-               free_index_zone(zone);
-               return result;
-       }
-
-       result = uds_make_open_chapter(index->volume->geometry, index->zone_count,
-                                      &zone->writing_chapter);
-       if (result != UDS_SUCCESS) {
-               free_index_zone(zone);
-               return result;
-       }
-
-       zone->index = index;
-       zone->id = zone_number;
-       index->zones[zone_number] = zone;
-
-       return UDS_SUCCESS;
-}
-
-int uds_make_index(struct uds_configuration *config, enum uds_open_index_type open_type,
-                  struct index_load_context *load_context, index_callback_fn callback,
-                  struct uds_index **new_index)
-{
-       int result;
-       bool loaded = false;
-       bool new = (open_type == UDS_CREATE);
-       struct uds_index *index = NULL;
-       struct index_zone *zone;
-       u64 nonce;
-       unsigned int z;
-
-       result = uds_allocate_extended(struct uds_index, config->zone_count,
-                                      struct uds_request_queue *, "index", &index);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       index->zone_count = config->zone_count;
-
-       result = uds_make_index_layout(config, new, &index->layout);
-       if (result != UDS_SUCCESS) {
-               uds_free_index(index);
-               return result;
-       }
-
-       result = uds_allocate(index->zone_count, struct index_zone *, "zones",
-                             &index->zones);
-       if (result != UDS_SUCCESS) {
-               uds_free_index(index);
-               return result;
-       }
-
-       result = uds_make_volume(config, index->layout, &index->volume);
-       if (result != UDS_SUCCESS) {
-               uds_free_index(index);
-               return result;
-       }
-
-       index->volume->lookup_mode = LOOKUP_NORMAL;
-       for (z = 0; z < index->zone_count; z++) {
-               result = make_index_zone(index, z);
-               if (result != UDS_SUCCESS) {
-                       uds_free_index(index);
-                       return uds_log_error_strerror(result,
-                                                     "Could not create index zone");
-               }
-       }
-
-       nonce = uds_get_volume_nonce(index->layout);
-       result = uds_make_volume_index(config, nonce, &index->volume_index);
-       if (result != UDS_SUCCESS) {
-               uds_free_index(index);
-               return uds_log_error_strerror(result, "could not make volume index");
-       }
-
-       index->load_context = load_context;
-       index->callback = callback;
-
-       result = initialize_index_queues(index, config->geometry);
-       if (result != UDS_SUCCESS) {
-               uds_free_index(index);
-               return result;
-       }
-
-       result = make_chapter_writer(index, &index->chapter_writer);
-       if (result != UDS_SUCCESS) {
-               uds_free_index(index);
-               return result;
-       }
-
-       if (!new) {
-               result = load_index(index);
-               switch (result) {
-               case UDS_SUCCESS:
-                       loaded = true;
-                       break;
-               case -ENOMEM:
-                       /* We should not try a rebuild for this error. */
-                       uds_log_error_strerror(result, "index could not be loaded");
-                       break;
-               default:
-                       uds_log_error_strerror(result, "index could not be loaded");
-                       if (open_type == UDS_LOAD) {
-                               result = rebuild_index(index);
-                               if (result != UDS_SUCCESS) {
-                                       uds_log_error_strerror(result,
-                                                              "index could not be rebuilt");
-                               }
-                       }
-                       break;
-               }
-       }
-
-       if (result != UDS_SUCCESS) {
-               uds_free_index(index);
-               return uds_log_error_strerror(result, "fatal error in %s()", __func__);
-       }
-
-       for (z = 0; z < index->zone_count; z++) {
-               zone = index->zones[z];
-               zone->oldest_virtual_chapter = index->oldest_virtual_chapter;
-               zone->newest_virtual_chapter = index->newest_virtual_chapter;
-       }
-
-       if (index->load_context != NULL) {
-               mutex_lock(&index->load_context->mutex);
-               index->load_context->status = INDEX_READY;
-               /*
-                * If we get here, suspend is meaningless, but notify any thread trying to suspend
-                * us so it doesn't hang.
-                */
-               uds_broadcast_cond(&index->load_context->cond);
-               mutex_unlock(&index->load_context->mutex);
-       }
-
-       index->has_saved_open_chapter = loaded;
-       index->need_to_save = !loaded;
-       *new_index = index;
-       return UDS_SUCCESS;
-}
-
-void uds_free_index(struct uds_index *index)
-{
-       unsigned int i;
-
-       if (index == NULL)
-               return;
-
-       uds_request_queue_finish(index->triage_queue);
-       for (i = 0; i < index->zone_count; i++)
-               uds_request_queue_finish(index->zone_queues[i]);
-
-       free_chapter_writer(index->chapter_writer);
-
-       uds_free_volume_index(index->volume_index);
-       if (index->zones != NULL) {
-               for (i = 0; i < index->zone_count; i++)
-                       free_index_zone(index->zones[i]);
-               uds_free(index->zones);
-       }
-
-       uds_free_volume(index->volume);
-       uds_free_index_layout(uds_forget(index->layout));
-       uds_free(index);
-}
-
-/* Wait for the chapter writer to complete any outstanding writes. */
-void uds_wait_for_idle_index(struct uds_index *index)
-{
-       struct chapter_writer *writer = index->chapter_writer;
-
-       mutex_lock(&writer->mutex);
-       while (writer->zones_to_write > 0)
-               uds_wait_cond(&writer->cond, &writer->mutex);
-       mutex_unlock(&writer->mutex);
-}
-
-/* This function assumes that all requests have been drained. */
-int uds_save_index(struct uds_index *index)
-{
-       int result;
-
-       if (!index->need_to_save)
-               return UDS_SUCCESS;
-
-       uds_wait_for_idle_index(index);
-       index->prev_save = index->last_save;
-       index->last_save = ((index->newest_virtual_chapter == 0) ?
-                           NO_LAST_SAVE : index->newest_virtual_chapter - 1);
-       uds_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save);
-
-       result = uds_save_index_state(index->layout, index);
-       if (result != UDS_SUCCESS) {
-               uds_log_info("save index failed");
-               index->last_save = index->prev_save;
-       } else {
-               index->has_saved_open_chapter = true;
-               index->need_to_save = false;
-               uds_log_info("finished save (vcn %llu)",
-                            (unsigned long long) index->last_save);
-       }
-
-       return result;
-}
-
-int uds_replace_index_storage(struct uds_index *index, struct block_device *bdev)
-{
-       return uds_replace_volume_storage(index->volume, index->layout, bdev);
-}
-
-/* Accessing statistics should be safe from any thread. */
-void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters)
-{
-       struct volume_index_stats stats;
-
-       uds_get_volume_index_stats(index->volume_index, &stats);
-       counters->entries_indexed = stats.record_count;
-       counters->collisions = stats.collision_count;
-       counters->entries_discarded = stats.discard_count;
-
-       counters->memory_used = (index->volume_index->memory_size +
-                                index->volume->cache_size +
-                                index->chapter_writer->memory_size);
-}
-
-void uds_enqueue_request(struct uds_request *request, enum request_stage stage)
-{
-       struct uds_index *index = request->index;
-       struct uds_request_queue *queue;
-
-       switch (stage) {
-       case STAGE_TRIAGE:
-               if (index->triage_queue != NULL) {
-                       queue = index->triage_queue;
-                       break;
-               }
-
-               fallthrough;
-
-       case STAGE_INDEX:
-               request->zone_number =
-                       uds_get_volume_index_zone(index->volume_index, &request->record_name);
-               fallthrough;
-
-       case STAGE_MESSAGE:
-               queue = index->zone_queues[request->zone_number];
-               break;
-
-       default:
-               ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage);
-               return;
-       }
-
-       uds_request_queue_enqueue(queue, request);
-}
diff --git a/drivers/md/dm-vdo/index.h b/drivers/md/dm-vdo/index.h
deleted file mode 100644 (file)
index edabb23..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_INDEX_H
-#define UDS_INDEX_H
-
-#include "index-layout.h"
-#include "index-session.h"
-#include "open-chapter.h"
-#include "volume.h"
-#include "volume-index.h"
-
-/*
- * The index is a high-level structure which represents the totality of the UDS index. It manages
- * the queues for incoming requests and dispatches them to the appropriate sub-components like the
- * volume or the volume index. It also manages administrative tasks such as saving and loading the
- * index.
- *
- * The index is divided into a number of independent zones and assigns each request to a zone based
- * on its name. Most sub-components are similarly divided into zones as well so that requests in
- * each zone usually operate without interference or coordination between zones.
- */
-
-typedef void (*index_callback_fn)(struct uds_request *request);
-
-struct index_zone {
-       struct uds_index *index;
-       struct open_chapter_zone *open_chapter;
-       struct open_chapter_zone *writing_chapter;
-       u64 oldest_virtual_chapter;
-       u64 newest_virtual_chapter;
-       unsigned int id;
-};
-
-struct uds_index {
-       bool has_saved_open_chapter;
-       bool need_to_save;
-       struct index_load_context *load_context;
-       struct index_layout *layout;
-       struct volume_index *volume_index;
-       struct volume *volume;
-       unsigned int zone_count;
-       struct index_zone **zones;
-
-       u64 oldest_virtual_chapter;
-       u64 newest_virtual_chapter;
-
-       u64 last_save;
-       u64 prev_save;
-       struct chapter_writer *chapter_writer;
-
-       index_callback_fn callback;
-       struct uds_request_queue *triage_queue;
-       struct uds_request_queue *zone_queues[];
-};
-
-enum request_stage {
-       STAGE_TRIAGE,
-       STAGE_INDEX,
-       STAGE_MESSAGE,
-};
-
-int __must_check uds_make_index(struct uds_configuration *config,
-                               enum uds_open_index_type open_type,
-                               struct index_load_context *load_context,
-                               index_callback_fn callback, struct uds_index **new_index);
-
-int __must_check uds_save_index(struct uds_index *index);
-
-void uds_free_index(struct uds_index *index);
-
-int __must_check uds_replace_index_storage(struct uds_index *index,
-                                          struct block_device *bdev);
-
-void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters);
-
-void uds_enqueue_request(struct uds_request *request, enum request_stage stage);
-
-void uds_wait_for_idle_index(struct uds_index *index);
-
-#endif /* UDS_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer.h b/drivers/md/dm-vdo/indexer.h
deleted file mode 100644 (file)
index 3744aaf..0000000
+++ /dev/null
@@ -1,353 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef INDEXER_H
-#define INDEXER_H
-
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/wait.h>
-
-#include "funnel-queue.h"
-
-/*
- * UDS public API
- *
- * The Universal Deduplication System (UDS) is an efficient name-value store. When used for
- * deduplicating storage, the names are generally hashes of data blocks and the associated data is
- * where that block is located on the underlying storage medium. The stored names are expected to
- * be randomly distributed among the space of possible names. If this assumption is violated, the
- * UDS index will store fewer names than normal but will otherwise continue to work. The data
- * associated with each name can be any 16-byte value.
- *
- * A client must first create an index session to interact with an index. Once created, the session
- * can be shared among multiple threads or users. When a session is destroyed, it will also close
- * and save any associated index.
- *
- * To make a request, a client must allocate a uds_request structure and set the required fields
- * before launching it. UDS will invoke the provided callback to complete the request. After the
- * callback has been called, the uds_request structure can be freed or reused for a new request.
- * There are five types of requests:
- *
- * A UDS_UPDATE request will associate the provided name with the provided data. Any previous data
- * associated with that name will be discarded.
- *
- * A UDS_QUERY request will return the data associated with the provided name, if any. The entry
- * for the name will also be marked as most recent, as if the data had been updated.
- *
- * A UDS_POST request is a combination of UDS_QUERY and UDS_UPDATE. If there is already data
- * associated with the provided name, that data is returned. If there is no existing association,
- * the name is associated with the newly provided data. This request is equivalent to a UDS_QUERY
- * request followed by a UDS_UPDATE request if no data is found, but it is much more efficient.
- *
- * A UDS_QUERY_NO_UPDATE request will return the data associated with the provided name, but will
- * not change the recency of the entry for the name. This request is primarily useful for testing,
- * to determine whether an entry exists without changing the internal state of the index.
- *
- * A UDS_DELETE request removes any data associated with the provided name. This operation is
- * generally not necessary, because the index will automatically discard its oldest entries once it
- * becomes full.
- */
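
As a rough sketch of the request flow described above (the callback and helper names here are hypothetical illustrations, not part of this header), a client might launch a UDS_POST request like this:

static void example_callback(struct uds_request *request)
{
        /*
         * Sketch only. UDS invokes this when the request completes; request->status,
         * request->found, and request->old_metadata are now valid, and the request
         * structure may be reused or freed.
         */
}

static int example_post(struct uds_index_session *session, struct uds_request *request,
                        const struct uds_record_name *name,
                        const struct uds_record_data *data)
{
        /* The caller must keep the request allocated until the callback has run. */
        memset(request, 0, sizeof(*request));
        request->record_name = *name;
        request->new_metadata = *data;
        request->callback = example_callback;
        request->session = session;
        request->type = UDS_POST;
        return uds_launch_request(request);
}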
-
-/* General UDS constants and structures */
-
-enum uds_request_type {
-       /* Create or update the mapping for a name, and make the name most recent. */
-       UDS_UPDATE,
-
-       /* Return any mapped data for a name, and make the name most recent. */
-       UDS_QUERY,
-
-       /*
-        * Return any mapped data for a name, or map the provided data to the name if there is no
-        * current data, and make the name most recent.
-        */
-       UDS_POST,
-
-       /* Return any mapped data for a name without updating its recency. */
-       UDS_QUERY_NO_UPDATE,
-
-       /* Remove any mapping for a name. */
-       UDS_DELETE,
-};
-
-enum uds_open_index_type {
-       /* Create a new index. */
-       UDS_CREATE,
-
-       /* Load an existing index and try to recover if necessary. */
-       UDS_LOAD,
-
-       /* Load an existing index, but only if it was saved cleanly. */
-       UDS_NO_REBUILD,
-};
-
-enum {
-       /* The record name size in bytes */
-       UDS_RECORD_NAME_SIZE = 16,
-       /* The maximum record data size in bytes */
-       UDS_RECORD_DATA_SIZE = 16,
-};
-
-/*
- * A type representing a UDS memory configuration which is either a positive integer number of
- * gigabytes or one of the six special constants for configurations smaller than one gigabyte.
- */
-typedef int uds_memory_config_size_t;
-
-enum {
-       /* The maximum configurable amount of memory */
-       UDS_MEMORY_CONFIG_MAX = 1024,
-       /* Flag indicating that the index has one less chapter than usual */
-       UDS_MEMORY_CONFIG_REDUCED = 0x1000,
-       UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 + UDS_MEMORY_CONFIG_REDUCED,
-       /* Special values indicating sizes less than 1 GB */
-       UDS_MEMORY_CONFIG_256MB = -256,
-       UDS_MEMORY_CONFIG_512MB = -512,
-       UDS_MEMORY_CONFIG_768MB = -768,
-       UDS_MEMORY_CONFIG_REDUCED_256MB = -1280,
-       UDS_MEMORY_CONFIG_REDUCED_512MB = -1536,
-       UDS_MEMORY_CONFIG_REDUCED_768MB = -1792,
-};
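
For example (illustrative values only), an index of one gigabyte or more is requested with a positive gigabyte count, while sub-gigabyte sizes use the special constants; the chosen value goes in the memory_size field of struct uds_parameters, declared below:

        uds_memory_config_size_t two_gigabytes = 2;                          /* a 2 GB index */
        uds_memory_config_size_t quarter_gigabyte = UDS_MEMORY_CONFIG_256MB; /* a 256 MB index */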
-
-struct uds_record_name {
-       unsigned char name[UDS_RECORD_NAME_SIZE];
-};
-
-struct uds_record_data {
-       unsigned char data[UDS_RECORD_DATA_SIZE];
-};
-
-struct uds_volume_record {
-       struct uds_record_name name;
-       struct uds_record_data data;
-};
-
-struct uds_parameters {
-       /* The block_device used for storage */
-       struct block_device *bdev;
-       /* The maximum allowable size of the index on storage */
-       size_t size;
-       /* The offset where the index should start */
-       off_t offset;
-       /* The maximum memory allocation, in GB */
-       uds_memory_config_size_t memory_size;
-       /* Whether the index should include sparse chapters */
-       bool sparse;
-       /* A 64-bit nonce to validate the index */
-       u64 nonce;
-       /* The number of threads used to process index requests */
-       unsigned int zone_count;
-       /* The number of threads used to read volume pages */
-       unsigned int read_threads;
-};
-
-/*
- * These statistics capture characteristics of the current index, including resource usage and
- * requests processed since the index was opened.
- */
-struct uds_index_stats {
-       /* The total number of records stored in the index */
-       u64 entries_indexed;
-       /* An estimate of the index's memory usage, in bytes */
-       u64 memory_used;
-       /* The number of collisions recorded in the volume index */
-       u64 collisions;
-       /* The number of entries discarded from the index since startup */
-       u64 entries_discarded;
-       /* The time at which these statistics were fetched */
-       s64 current_time;
-       /* The number of post calls that found an existing entry */
-       u64 posts_found;
-       /* The number of post calls that added an entry */
-       u64 posts_not_found;
-       /*
-        * The number of post calls that found an existing entry that is recent enough to still
-        * exist only in memory and not yet be committed to disk
-        */
-       u64 in_memory_posts_found;
-       /*
-        * The number of post calls that found an existing entry in the dense portion of the index
-        */
-       u64 dense_posts_found;
-       /*
-        * The number of post calls that found an existing entry in the sparse portion of the index
-        */
-       u64 sparse_posts_found;
-       /* The number of update calls that updated an existing entry */
-       u64 updates_found;
-       /* The number of update calls that added a new entry */
-       u64 updates_not_found;
-       /* The number of delete requests that deleted an existing entry */
-       u64 deletions_found;
-       /* The number of delete requests that did nothing */
-       u64 deletions_not_found;
-       /* The number of query calls that found an existing entry */
-       u64 queries_found;
-       /* The number of query calls that did not find an entry */
-       u64 queries_not_found;
-       /* The total number of requests processed */
-       u64 requests;
-};
-
-enum uds_index_region {
-       /* No location information has been determined */
-       UDS_LOCATION_UNKNOWN = 0,
-       /* The index page entry has been found */
-       UDS_LOCATION_INDEX_PAGE_LOOKUP,
-       /* The record page entry has been found */
-       UDS_LOCATION_RECORD_PAGE_LOOKUP,
-       /* The record is not in the index */
-       UDS_LOCATION_UNAVAILABLE,
-       /* The record was found in the open chapter */
-       UDS_LOCATION_IN_OPEN_CHAPTER,
-       /* The record was found in the dense part of the index */
-       UDS_LOCATION_IN_DENSE,
-       /* The record was found in the sparse part of the index */
-       UDS_LOCATION_IN_SPARSE,
-} __packed;
-
-/* Zone message requests are used to communicate between index zones. */
-enum uds_zone_message_type {
-       /* A standard request with no message */
-       UDS_MESSAGE_NONE = 0,
-       /* Add a chapter to the sparse chapter index cache */
-       UDS_MESSAGE_SPARSE_CACHE_BARRIER,
-       /* Close a chapter to keep the zone from falling behind */
-       UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
-} __packed;
-
-struct uds_zone_message {
-       /* The type of message, determining how it will be processed */
-       enum uds_zone_message_type type;
-       /* The virtual chapter number to which the message applies */
-       u64 virtual_chapter;
-};
-
-struct uds_index_session;
-struct uds_index;
-struct uds_request;
-
-/* Once this callback has been invoked, the uds_request structure can be reused or freed. */
-typedef void (*uds_request_callback_fn)(struct uds_request *request);
-
-struct uds_request {
-       /* These input fields must be set before launching a request. */
-
-       /* The name of the record to look up or create */
-       struct uds_record_name record_name;
-       /* New data to associate with the record name, if applicable */
-       struct uds_record_data new_metadata;
-       /* A callback to invoke when the request is complete */
-       uds_request_callback_fn callback;
-       /* The index session that will manage this request */
-       struct uds_index_session *session;
-       /* The type of operation to perform, as described above */
-       enum uds_request_type type;
-
-       /* These output fields are set when a request is complete. */
-
-       /* The existing data associated with the request name, if any */
-       struct uds_record_data old_metadata;
-       /* Either UDS_SUCCESS or an error code for the request */
-       int status;
-       /* True if the record name had an existing entry in the index */
-       bool found;
-
-       /*
-        * The remaining fields are used internally and should not be altered by clients. The index
-        * relies on zone_number being the first field in this section.
-        */
-
-       /* The number of the zone which will process this request */
-       unsigned int zone_number;
-       /* A link for adding a request to a lock-free queue */
-       struct funnel_queue_entry queue_link;
-       /* A link for adding a request to a standard linked list */
-       struct uds_request *next_request;
-       /* A pointer to the index processing this request */
-       struct uds_index *index;
-       /* Control message for coordinating between zones */
-       struct uds_zone_message zone_message;
-       /* If true, process request immediately by waking the worker thread */
-       bool unbatched;
-       /* If true, continue this request before processing newer requests */
-       bool requeued;
-       /* The virtual chapter containing the record name, if known */
-       u64 virtual_chapter;
-       /* The region of the index containing the record name */
-       enum uds_index_region location;
-};
-
-/* Compute the number of bytes needed to store an index. */
-int __must_check uds_compute_index_size(const struct uds_parameters *parameters,
-                                       u64 *index_size);
-
-/* A session is required for most index operations. */
-int __must_check uds_create_index_session(struct uds_index_session **session);
-
-/* Destroying an index session also closes and saves the associated index. */
-int uds_destroy_index_session(struct uds_index_session *session);
-
-/*
- * Create or open an index with an existing session. This operation fails if the index session is
- * suspended, or if there is already an open index.
- */
-int __must_check uds_open_index(enum uds_open_index_type open_type,
-                               const struct uds_parameters *parameters,
-                               struct uds_index_session *session);
-
-/*
- * Wait until all callbacks for index operations are complete, and prevent new index operations
- * from starting. New index operations will fail with EBUSY until the session is resumed. Also
- * optionally saves the index.
- */
-int __must_check uds_suspend_index_session(struct uds_index_session *session, bool save);
-
-/*
- * Allow new index operations for an index, whether it was suspended or not. If the index is
- * suspended and the supplied block device differs from the current backing store, the index will
- * start using the new backing store instead.
- */
-int __must_check uds_resume_index_session(struct uds_index_session *session,
-                                         struct block_device *bdev);
-
-/* Wait until all outstanding index operations are complete. */
-int __must_check uds_flush_index_session(struct uds_index_session *session);
-
-/* Close an index. This operation fails if the index session is suspended. */
-int __must_check uds_close_index(struct uds_index_session *session);
-
-/* Get index statistics since the last time the index was opened. */
-int __must_check uds_get_index_session_stats(struct uds_index_session *session,
-                                            struct uds_index_stats *stats);
-
-/* This function will fail if any required field of the request is not set. */
-int __must_check uds_launch_request(struct uds_request *request);
-
-struct cond_var {
-       wait_queue_head_t wait_queue;
-};
-
-static inline void uds_init_cond(struct cond_var *cv)
-{
-       init_waitqueue_head(&cv->wait_queue);
-}
-
-static inline void uds_signal_cond(struct cond_var *cv)
-{
-       wake_up(&cv->wait_queue);
-}
-
-static inline void uds_broadcast_cond(struct cond_var *cv)
-{
-       wake_up_all(&cv->wait_queue);
-}
-
-void uds_wait_cond(struct cond_var *cv, struct mutex *mutex);
-
-#endif /* INDEXER_H */
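
For orientation, the session and request declarations above compose into a short lifecycle: create a session, open an index on a block device, launch requests, flush, close, and destroy. A minimal sketch follows; it assumes UDS_CREATE and UDS_POST are members of the enum uds_open_index_type and enum uds_request_type referenced above, and it fills only the uds_parameters fields that uds_make_configuration() reads.

#include "indexer.h"

static void advice_callback(struct uds_request *request)
{
	/* Runs once per request; the request may be reused or freed afterward. */
	if (request->status == UDS_SUCCESS && request->found)
		uds_log_info("record already present in the index");
}

static int post_one_record(struct block_device *bdev,
			   const struct uds_record_name *name,
			   const struct uds_record_data *data)
{
	struct uds_parameters parameters = {
		.bdev = bdev,
		.memory_size = UDS_MEMORY_CONFIG_256MB,
		.nonce = 1,
	};
	struct uds_index_session *session;
	struct uds_request request = {
		.record_name = *name,
		.new_metadata = *data,
		.type = UDS_POST,		/* assumed request type */
		.callback = advice_callback,
	};
	int result;

	result = uds_create_index_session(&session);
	if (result != UDS_SUCCESS)
		return result;

	result = uds_open_index(UDS_CREATE, &parameters, session); /* assumed open type */
	if (result == UDS_SUCCESS) {
		request.session = session;
		result = uds_launch_request(&request);
		if (result == UDS_SUCCESS)
			result = uds_flush_index_session(session);
		uds_close_index(session);
	}

	uds_destroy_index_session(session);
	return result;
}
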
diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c
new file mode 100644 (file)
index 0000000..6487825
--- /dev/null
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "chapter-index.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "hash-utils.h"
+#include "indexer.h"
+
+int uds_make_open_chapter_index(struct open_chapter_index **chapter_index,
+                               const struct index_geometry *geometry, u64 volume_nonce)
+{
+       int result;
+       size_t memory_size;
+       struct open_chapter_index *index;
+
+       result = uds_allocate(1, struct open_chapter_index, "open chapter index", &index);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       /*
+        * The delta index will rebalance delta lists when memory gets tight,
+        * so give the chapter index one extra page.
+        */
+       memory_size = ((geometry->index_pages_per_chapter + 1) * geometry->bytes_per_page);
+       index->geometry = geometry;
+       index->volume_nonce = volume_nonce;
+       result = uds_initialize_delta_index(&index->delta_index, 1,
+                                           geometry->delta_lists_per_chapter,
+                                           geometry->chapter_mean_delta,
+                                           geometry->chapter_payload_bits,
+                                           memory_size, 'm');
+       if (result != UDS_SUCCESS) {
+               uds_free(index);
+               return result;
+       }
+
+       index->memory_size = index->delta_index.memory_size + sizeof(struct open_chapter_index);
+       *chapter_index = index;
+       return UDS_SUCCESS;
+}
+
+void uds_free_open_chapter_index(struct open_chapter_index *chapter_index)
+{
+       if (chapter_index == NULL)
+               return;
+
+       uds_uninitialize_delta_index(&chapter_index->delta_index);
+       uds_free(chapter_index);
+}
+
+/* Re-initialize an open chapter index for a new chapter. */
+void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index,
+                                 u64 virtual_chapter_number)
+{
+       uds_reset_delta_index(&chapter_index->delta_index);
+       chapter_index->virtual_chapter_number = virtual_chapter_number;
+}
+
+static inline bool was_entry_found(const struct delta_index_entry *entry, u32 address)
+{
+       return (!entry->at_end) && (entry->key == address);
+}
+
+/* Associate a record name with the record page containing its metadata. */
+int uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index,
+                                     const struct uds_record_name *name,
+                                     u32 page_number)
+{
+       int result;
+       struct delta_index_entry entry;
+       u32 address;
+       u32 list_number;
+       const u8 *found_name;
+       bool found;
+       const struct index_geometry *geometry = chapter_index->geometry;
+       u64 chapter_number = chapter_index->virtual_chapter_number;
+       u32 record_pages = geometry->record_pages_per_chapter;
+
+       result = ASSERT(page_number < record_pages,
+                       "Page number within chapter (%u) exceeds the maximum value %u",
+                       page_number, record_pages);
+       if (result != UDS_SUCCESS)
+               return UDS_INVALID_ARGUMENT;
+
+       address = uds_hash_to_chapter_delta_address(name, geometry);
+       list_number = uds_hash_to_chapter_delta_list(name, geometry);
+       result = uds_get_delta_index_entry(&chapter_index->delta_index, list_number,
+                                          address, name->name, &entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       found = was_entry_found(&entry, address);
+       result = ASSERT(!(found && entry.is_collision),
+                       "Chunk appears more than once in chapter %llu",
+                       (unsigned long long) chapter_number);
+       if (result != UDS_SUCCESS)
+               return UDS_BAD_STATE;
+
+       found_name = (found ? name->name : NULL);
+       return uds_put_delta_index_entry(&entry, address, page_number, found_name);
+}
+
+/*
+ * Pack a section of an open chapter index into a chapter index page. A range of delta lists
+ * (starting with a specified list index) is copied from the open chapter index into a memory page.
+ * The number of lists copied onto the page is returned to the caller on success.
+ *
+ * @chapter_index: The open chapter index
+ * @memory: The memory page to use
+ * @first_list: The first delta list number to be copied
+ * @last_page: If true, this is the last page of the chapter index and all the remaining lists must
+ *             be packed onto this page
+ * @lists_packed: The number of delta lists that were packed onto this page
+ */
+int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
+                                    u8 *memory, u32 first_list, bool last_page,
+                                    u32 *lists_packed)
+{
+       int result;
+       struct delta_index *delta_index = &chapter_index->delta_index;
+       struct delta_index_stats stats;
+       u64 nonce = chapter_index->volume_nonce;
+       u64 chapter_number = chapter_index->virtual_chapter_number;
+       const struct index_geometry *geometry = chapter_index->geometry;
+       u32 list_count = geometry->delta_lists_per_chapter;
+       unsigned int removals = 0;
+       struct delta_index_entry entry;
+       u32 next_list;
+       s32 list_number;
+
+       for (;;) {
+               result = uds_pack_delta_index_page(delta_index, nonce, memory,
+                                                  geometry->bytes_per_page,
+                                                  chapter_number, first_list,
+                                                  lists_packed);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               if ((first_list + *lists_packed) == list_count) {
+                       /* All lists are packed. */
+                       break;
+               } else if (*lists_packed == 0) {
+                       /*
+                        * The next delta list does not fit on a page. This delta list will be
+                        * removed.
+                        */
+               } else if (last_page) {
+                       /*
+                        * This is the last page and there are lists left unpacked, but all of the
+                        * remaining lists must fit on the page. Find a list that contains entries
+                        * and remove the entire list. Try the first list that does not fit. If it
+                        * is empty, we will select the last list that already fits and has any
+                        * entries.
+                        */
+               } else {
+                       /* This page is done. */
+                       break;
+               }
+
+               if (removals == 0) {
+                       uds_get_delta_index_stats(delta_index, &stats);
+                       uds_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions",
+                                       (unsigned long long) chapter_number,
+                                       (unsigned long long) stats.record_count,
+                                       (unsigned long long) stats.collision_count);
+               }
+
+               list_number = *lists_packed;
+               do {
+                       if (list_number < 0)
+                               return UDS_OVERFLOW;
+
+                       next_list = first_list + list_number--;
+                       result = uds_start_delta_index_search(delta_index, next_list, 0,
+                                                             &entry);
+                       if (result != UDS_SUCCESS)
+                               return result;
+
+                       result = uds_next_delta_index_entry(&entry);
+                       if (result != UDS_SUCCESS)
+                               return result;
+               } while (entry.at_end);
+
+               do {
+                       result = uds_remove_delta_index_entry(&entry);
+                       if (result != UDS_SUCCESS)
+                               return result;
+
+                       removals++;
+               } while (!entry.at_end);
+       }
+
+       if (removals > 0) {
+               uds_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index",
+                               (unsigned long long) chapter_number, removals);
+       }
+
+       return UDS_SUCCESS;
+}
+
+/* Make a new chapter index page, initializing it with the data from a given index_page buffer. */
+int uds_initialize_chapter_index_page(struct delta_index_page *index_page,
+                                     const struct index_geometry *geometry,
+                                     u8 *page_buffer, u64 volume_nonce)
+{
+       return uds_initialize_delta_index_page(index_page, volume_nonce,
+                                              geometry->chapter_mean_delta,
+                                              geometry->chapter_payload_bits,
+                                              page_buffer, geometry->bytes_per_page);
+}
+
+/* Validate a chapter index page read during rebuild. */
+int uds_validate_chapter_index_page(const struct delta_index_page *index_page,
+                                   const struct index_geometry *geometry)
+{
+       int result;
+       const struct delta_index *delta_index = &index_page->delta_index;
+       u32 first = index_page->lowest_list_number;
+       u32 last = index_page->highest_list_number;
+       u32 list_number;
+
+       /* We walk every delta list from start to finish. */
+       for (list_number = first; list_number <= last; list_number++) {
+               struct delta_index_entry entry;
+
+               result = uds_start_delta_index_search(delta_index, list_number - first,
+                                                     0, &entry);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               for (;;) {
+                       result = uds_next_delta_index_entry(&entry);
+                       if (result != UDS_SUCCESS) {
+                               /*
+                                * A random bit stream is highly likely to arrive here when we go
+                                * past the end of the delta list.
+                                */
+                               return result;
+                       }
+
+                       if (entry.at_end)
+                               break;
+
+                       /* Also make sure that the record page field contains a plausible value. */
+                       if (uds_get_delta_entry_value(&entry) >=
+                           geometry->record_pages_per_chapter) {
+                               /*
+                                * Do not log this as an error. It happens in normal operation when
+                                * we are doing a rebuild but haven't written the entire volume
+                                * once.
+                                */
+                               return UDS_CORRUPT_DATA;
+                       }
+               }
+       }
+       return UDS_SUCCESS;
+}
+
+/*
+ * Search a chapter index page for a record name, returning the record page number that may contain
+ * the name.
+ */
+int uds_search_chapter_index_page(struct delta_index_page *index_page,
+                                 const struct index_geometry *geometry,
+                                 const struct uds_record_name *name,
+                                 u16 *record_page_ptr)
+{
+       int result;
+       struct delta_index *delta_index = &index_page->delta_index;
+       u32 address = uds_hash_to_chapter_delta_address(name, geometry);
+       u32 delta_list_number = uds_hash_to_chapter_delta_list(name, geometry);
+       u32 sub_list_number = delta_list_number - index_page->lowest_list_number;
+       struct delta_index_entry entry;
+
+       result = uds_get_delta_index_entry(delta_index, sub_list_number, address,
+                                          name->name, &entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (was_entry_found(&entry, address))
+               *record_page_ptr = uds_get_delta_entry_value(&entry);
+       else
+               *record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
+
+       return UDS_SUCCESS;
+}
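
The pack routine above is meant to be driven one page at a time until every delta list in the chapter has been placed. A minimal sketch of that loop, assuming the caller supplies one buffer per index page (the real caller also records which lists landed on which page via the index page map, omitted here):

static int pack_all_index_pages(struct open_chapter_index *chapter_index,
				const struct index_geometry *geometry,
				u8 **page_buffers)
{
	u32 page;
	u32 first_list = 0;
	u32 lists_packed;
	int result;

	for (page = 0; page < geometry->index_pages_per_chapter; page++) {
		bool last_page = (page == geometry->index_pages_per_chapter - 1);

		/* Pack as many remaining lists as fit onto this page. */
		result = uds_pack_open_chapter_index_page(chapter_index,
							  page_buffers[page],
							  first_list, last_page,
							  &lists_packed);
		if (result != UDS_SUCCESS)
			return result;

		first_list += lists_packed;
	}

	return UDS_SUCCESS;
}
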
diff --git a/drivers/md/dm-vdo/indexer/chapter-index.h b/drivers/md/dm-vdo/indexer/chapter-index.h
new file mode 100644 (file)
index 0000000..be8bf2b
--- /dev/null
@@ -0,0 +1,61 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_CHAPTER_INDEX_H
+#define UDS_CHAPTER_INDEX_H
+
+#include <linux/limits.h>
+
+#include "delta-index.h"
+#include "geometry.h"
+
+/*
+ * A chapter index for an open chapter is a mutable structure that tracks all the records that have
+ * been added to the chapter. A chapter index for a closed chapter is similar except that it is
+ * immutable because the contents of a closed chapter can never change, and the immutable structure
+ * is more efficient. Both types of chapter index are implemented with a delta index.
+ */
+
+/* The value returned when no entry is found in the chapter index. */
+#define NO_CHAPTER_INDEX_ENTRY U16_MAX
+
+struct open_chapter_index {
+       const struct index_geometry *geometry;
+       struct delta_index delta_index;
+       u64 virtual_chapter_number;
+       u64 volume_nonce;
+       size_t memory_size;
+};
+
+int __must_check uds_make_open_chapter_index(struct open_chapter_index **chapter_index,
+                                            const struct index_geometry *geometry,
+                                            u64 volume_nonce);
+
+void uds_free_open_chapter_index(struct open_chapter_index *chapter_index);
+
+void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index,
+                                 u64 virtual_chapter_number);
+
+int __must_check uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index,
+                                                  const struct uds_record_name *name,
+                                                  u32 page_number);
+
+int __must_check uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index,
+                                                 u8 *memory, u32 first_list,
+                                                 bool last_page, u32 *lists_packed);
+
+int __must_check uds_initialize_chapter_index_page(struct delta_index_page *index_page,
+                                                  const struct index_geometry *geometry,
+                                                  u8 *page_buffer, u64 volume_nonce);
+
+int __must_check uds_validate_chapter_index_page(const struct delta_index_page *index_page,
+                                                const struct index_geometry *geometry);
+
+int __must_check uds_search_chapter_index_page(struct delta_index_page *index_page,
+                                              const struct index_geometry *geometry,
+                                              const struct uds_record_name *name,
+                                              u16 *record_page_ptr);
+
+#endif /* UDS_CHAPTER_INDEX_H */
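
The immutable side of the same structure is what the volume read path uses: uds_initialize_chapter_index_page() wraps a raw chapter index page, and uds_search_chapter_index_page() maps a record name to a record page number or NO_CHAPTER_INDEX_ENTRY. A minimal sketch, assuming the caller has already read page_data from the volume:

static int find_record_page(u8 *page_data, const struct index_geometry *geometry,
			    u64 volume_nonce, const struct uds_record_name *name,
			    u16 *record_page)
{
	struct delta_index_page index_page;
	int result;

	result = uds_initialize_chapter_index_page(&index_page, geometry, page_data,
						   volume_nonce);
	if (result != UDS_SUCCESS)
		return result;

	result = uds_search_chapter_index_page(&index_page, geometry, name, record_page);
	if (result != UDS_SUCCESS)
		return result;

	/* NO_CHAPTER_INDEX_ENTRY means the name is not in this chapter. */
	return UDS_SUCCESS;
}
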
diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c
new file mode 100644 (file)
index 0000000..0bf315e
--- /dev/null
@@ -0,0 +1,378 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "config.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "string-utils.h"
+#include "thread-utils.h"
+
+static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC";
+static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02";
+static const u8 INDEX_CONFIG_VERSION_8_02[] = "08.02";
+
+enum {
+       DEFAULT_VOLUME_READ_THREADS = 2,
+       MAX_VOLUME_READ_THREADS = 16,
+       INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1,
+       INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION_6_02) - 1,
+};
+
+static bool is_version(const u8 *version, u8 *buffer)
+{
+       return memcmp(version, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0;
+}
+
+static bool are_matching_configurations(struct uds_configuration *saved_config,
+                                       struct index_geometry *saved_geometry,
+                                       struct uds_configuration *user)
+{
+       struct index_geometry *geometry = user->geometry;
+       bool result = true;
+
+       if (saved_geometry->record_pages_per_chapter != geometry->record_pages_per_chapter) {
+               uds_log_error("Record pages per chapter (%u) does not match (%u)",
+                             saved_geometry->record_pages_per_chapter,
+                             geometry->record_pages_per_chapter);
+               result = false;
+       }
+
+       if (saved_geometry->chapters_per_volume != geometry->chapters_per_volume) {
+               uds_log_error("Chapter count (%u) does not match (%u)",
+                             saved_geometry->chapters_per_volume,
+                             geometry->chapters_per_volume);
+               result = false;
+       }
+
+       if (saved_geometry->sparse_chapters_per_volume != geometry->sparse_chapters_per_volume) {
+               uds_log_error("Sparse chapter count (%u) does not match (%u)",
+                             saved_geometry->sparse_chapters_per_volume,
+                             geometry->sparse_chapters_per_volume);
+               result = false;
+       }
+
+       if (saved_config->cache_chapters != user->cache_chapters) {
+               uds_log_error("Cache size (%u) does not match (%u)",
+                             saved_config->cache_chapters, user->cache_chapters);
+               result = false;
+       }
+
+       if (saved_config->volume_index_mean_delta != user->volume_index_mean_delta) {
+               uds_log_error("Volume index mean delta (%u) does not match (%u)",
+                             saved_config->volume_index_mean_delta,
+                             user->volume_index_mean_delta);
+               result = false;
+       }
+
+       if (saved_geometry->bytes_per_page != geometry->bytes_per_page) {
+               uds_log_error("Bytes per page value (%zu) does not match (%zu)",
+                             saved_geometry->bytes_per_page, geometry->bytes_per_page);
+               result = false;
+       }
+
+       if (saved_config->sparse_sample_rate != user->sparse_sample_rate) {
+               uds_log_error("Sparse sample rate (%u) does not match (%u)",
+                             saved_config->sparse_sample_rate,
+                             user->sparse_sample_rate);
+               result = false;
+       }
+
+       if (saved_config->nonce != user->nonce) {
+               uds_log_error("Nonce (%llu) does not match (%llu)",
+                             (unsigned long long) saved_config->nonce,
+                             (unsigned long long) user->nonce);
+               result = false;
+       }
+
+       return result;
+}
+
+/* Read the configuration and validate it against the provided one. */
+int uds_validate_config_contents(struct buffered_reader *reader,
+                                struct uds_configuration *user_config)
+{
+       int result;
+       struct uds_configuration config;
+       struct index_geometry geometry;
+       u8 version_buffer[INDEX_CONFIG_VERSION_LENGTH];
+       u32 bytes_per_page;
+       u8 buffer[sizeof(struct uds_configuration_6_02)];
+       size_t offset = 0;
+
+       result = uds_verify_buffered_data(reader, INDEX_CONFIG_MAGIC,
+                                         INDEX_CONFIG_MAGIC_LENGTH);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_read_from_buffered_reader(reader, version_buffer,
+                                              INDEX_CONFIG_VERSION_LENGTH);
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "cannot read index config version");
+
+       if (!is_version(INDEX_CONFIG_VERSION_6_02, version_buffer) &&
+           !is_version(INDEX_CONFIG_VERSION_8_02, version_buffer)) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "unsupported configuration version: '%.*s'",
+                                             INDEX_CONFIG_VERSION_LENGTH,
+                                             version_buffer);
+       }
+
+       result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer));
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "cannot read config data");
+
+       decode_u32_le(buffer, &offset, &geometry.record_pages_per_chapter);
+       decode_u32_le(buffer, &offset, &geometry.chapters_per_volume);
+       decode_u32_le(buffer, &offset, &geometry.sparse_chapters_per_volume);
+       decode_u32_le(buffer, &offset, &config.cache_chapters);
+       offset += sizeof(u32);
+       decode_u32_le(buffer, &offset, &config.volume_index_mean_delta);
+       decode_u32_le(buffer, &offset, &bytes_per_page);
+       geometry.bytes_per_page = bytes_per_page;
+       decode_u32_le(buffer, &offset, &config.sparse_sample_rate);
+       decode_u64_le(buffer, &offset, &config.nonce);
+
+       result = ASSERT(offset == sizeof(struct uds_configuration_6_02),
+                       "%zu bytes read but not decoded",
+                       sizeof(struct uds_configuration_6_02) - offset);
+       if (result != UDS_SUCCESS)
+               return UDS_CORRUPT_DATA;
+
+       if (is_version(INDEX_CONFIG_VERSION_6_02, version_buffer)) {
+               user_config->geometry->remapped_virtual = 0;
+               user_config->geometry->remapped_physical = 0;
+       } else {
+               u8 remapping[sizeof(u64) + sizeof(u64)];
+
+               result = uds_read_from_buffered_reader(reader, remapping,
+                                                      sizeof(remapping));
+               if (result != UDS_SUCCESS)
+                       return uds_log_error_strerror(result, "cannot read converted config");
+
+               offset = 0;
+               decode_u64_le(remapping, &offset,
+                             &user_config->geometry->remapped_virtual);
+               decode_u64_le(remapping, &offset,
+                             &user_config->geometry->remapped_physical);
+       }
+
+       if (!are_matching_configurations(&config, &geometry, user_config)) {
+               uds_log_warning("Supplied configuration does not match save");
+               return UDS_NO_INDEX;
+       }
+
+       return UDS_SUCCESS;
+}
+
+/*
+ * Write the configuration to stable storage. If the superblock version is < 4, write the 6.02
+ * version; otherwise write the 8.02 version, indicating the configuration is for an index that has
+ * been reduced by one chapter.
+ */
+int uds_write_config_contents(struct buffered_writer *writer,
+                             struct uds_configuration *config, u32 version)
+{
+       int result;
+       struct index_geometry *geometry = config->geometry;
+       u8 buffer[sizeof(struct uds_configuration_8_02)];
+       size_t offset = 0;
+
+       result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_MAGIC,
+                                             INDEX_CONFIG_MAGIC_LENGTH);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       /*
+        * If version is < 4, the index has not been reduced by a chapter so it must be written out
+        * as version 6.02 so that it is still compatible with older versions of UDS.
+        */
+       if (version >= 4) {
+               result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_8_02,
+                                                     INDEX_CONFIG_VERSION_LENGTH);
+               if (result != UDS_SUCCESS)
+                       return result;
+       } else {
+               result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_6_02,
+                                                     INDEX_CONFIG_VERSION_LENGTH);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       encode_u32_le(buffer, &offset, geometry->record_pages_per_chapter);
+       encode_u32_le(buffer, &offset, geometry->chapters_per_volume);
+       encode_u32_le(buffer, &offset, geometry->sparse_chapters_per_volume);
+       encode_u32_le(buffer, &offset, config->cache_chapters);
+       encode_u32_le(buffer, &offset, 0);
+       encode_u32_le(buffer, &offset, config->volume_index_mean_delta);
+       encode_u32_le(buffer, &offset, geometry->bytes_per_page);
+       encode_u32_le(buffer, &offset, config->sparse_sample_rate);
+       encode_u64_le(buffer, &offset, config->nonce);
+
+       result = ASSERT(offset == sizeof(struct uds_configuration_6_02),
+                       "%zu bytes encoded, of %zu expected", offset,
+                       sizeof(struct uds_configuration_6_02));
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (version >= 4) {
+               encode_u64_le(buffer, &offset, geometry->remapped_virtual);
+               encode_u64_le(buffer, &offset, geometry->remapped_physical);
+       }
+
+       return uds_write_to_buffered_writer(writer, buffer, offset);
+}
+
+/* Compute configuration parameters that depend on memory size. */
+static int compute_memory_sizes(uds_memory_config_size_t mem_gb, bool sparse,
+                               u32 *chapters_per_volume, u32 *record_pages_per_chapter,
+                               u32 *sparse_chapters_per_volume)
+{
+       u32 reduced_chapters = 0;
+       u32 base_chapters;
+
+       if (mem_gb == UDS_MEMORY_CONFIG_256MB) {
+               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+               *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER;
+       } else if (mem_gb == UDS_MEMORY_CONFIG_512MB) {
+               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+               *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER;
+       } else if (mem_gb == UDS_MEMORY_CONFIG_768MB) {
+               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+               *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER;
+       } else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) {
+               base_chapters = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME;
+               *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER;
+       } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) {
+               reduced_chapters = 1;
+               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+               *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER;
+       } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) {
+               reduced_chapters = 1;
+               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+               *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER;
+       } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) {
+               reduced_chapters = 1;
+               base_chapters = DEFAULT_CHAPTERS_PER_VOLUME;
+               *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER;
+       } else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) &&
+                  (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) {
+               reduced_chapters = 1;
+               base_chapters = ((mem_gb - UDS_MEMORY_CONFIG_REDUCED) *
+                                DEFAULT_CHAPTERS_PER_VOLUME);
+               *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER;
+       } else {
+               uds_log_error("received invalid memory size");
+               return -EINVAL;
+       }
+
+       if (sparse) {
+               /* Make 95% of chapters sparse, allowing 10x more records. */
+               *sparse_chapters_per_volume = (19 * base_chapters) / 2;
+               base_chapters *= 10;
+       } else {
+               *sparse_chapters_per_volume = 0;
+       }
+
+       *chapters_per_volume = base_chapters - reduced_chapters;
+       return UDS_SUCCESS;
+}
+
+static unsigned int __must_check normalize_zone_count(unsigned int requested)
+{
+       unsigned int zone_count = requested;
+
+       if (zone_count == 0)
+               zone_count = num_online_cpus() / 2;
+
+       if (zone_count < 1)
+               zone_count = 1;
+
+       if (zone_count > MAX_ZONES)
+               zone_count = MAX_ZONES;
+
+       uds_log_info("Using %u indexing zone%s for concurrency.",
+                    zone_count, zone_count == 1 ? "" : "s");
+       return zone_count;
+}
+
+static unsigned int __must_check normalize_read_threads(unsigned int requested)
+{
+       unsigned int read_threads = requested;
+
+       if (read_threads < 1)
+               read_threads = DEFAULT_VOLUME_READ_THREADS;
+
+       if (read_threads > MAX_VOLUME_READ_THREADS)
+               read_threads = MAX_VOLUME_READ_THREADS;
+
+       return read_threads;
+}
+
+int uds_make_configuration(const struct uds_parameters *params,
+                          struct uds_configuration **config_ptr)
+{
+       struct uds_configuration *config;
+       u32 chapters_per_volume = 0;
+       u32 record_pages_per_chapter = 0;
+       u32 sparse_chapters_per_volume = 0;
+       int result;
+
+       result = compute_memory_sizes(params->memory_size, params->sparse,
+                                     &chapters_per_volume, &record_pages_per_chapter,
+                                     &sparse_chapters_per_volume);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(1, struct uds_configuration, __func__, &config);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE, record_pages_per_chapter,
+                                        chapters_per_volume, sparse_chapters_per_volume,
+                                        0, 0, &config->geometry);
+       if (result != UDS_SUCCESS) {
+               uds_free_configuration(config);
+               return result;
+       }
+
+       config->zone_count = normalize_zone_count(params->zone_count);
+       config->read_threads = normalize_read_threads(params->read_threads);
+
+       config->cache_chapters = DEFAULT_CACHE_CHAPTERS;
+       config->volume_index_mean_delta = DEFAULT_VOLUME_INDEX_MEAN_DELTA;
+       config->sparse_sample_rate = (params->sparse ? DEFAULT_SPARSE_SAMPLE_RATE : 0);
+       config->nonce = params->nonce;
+       config->bdev = params->bdev;
+       config->offset = params->offset;
+       config->size = params->size;
+
+       *config_ptr = config;
+       return UDS_SUCCESS;
+}
+
+void uds_free_configuration(struct uds_configuration *config)
+{
+       if (config != NULL) {
+               uds_free_index_geometry(config->geometry);
+               uds_free(config);
+       }
+}
+
+void uds_log_configuration(struct uds_configuration *config)
+{
+       struct index_geometry *geometry = config->geometry;
+
+       uds_log_debug("Configuration:");
+       uds_log_debug("  Record pages per chapter:   %10u", geometry->record_pages_per_chapter);
+       uds_log_debug("  Chapters per volume:        %10u", geometry->chapters_per_volume);
+       uds_log_debug("  Sparse chapters per volume: %10u", geometry->sparse_chapters_per_volume);
+       uds_log_debug("  Cache size (chapters):      %10u", config->cache_chapters);
+       uds_log_debug("  Volume index mean delta:    %10u", config->volume_index_mean_delta);
+       uds_log_debug("  Bytes per page:             %10zu", geometry->bytes_per_page);
+       uds_log_debug("  Sparse sample rate:         %10u", config->sparse_sample_rate);
+       uds_log_debug("  Nonce:                      %llu", (unsigned long long) config->nonce);
+}
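
In compute_memory_sizes() above, the sparse branch marks (19 * base_chapters) / 2, i.e. 9.5x the base count, as sparse while growing the total to 10x, which is where the 95% figure comes from. Turning a filled-in uds_parameters into a logged configuration is then a single call pair; a minimal sketch with error handling trimmed:

static int log_index_configuration(const struct uds_parameters *parameters)
{
	struct uds_configuration *config;
	int result;

	result = uds_make_configuration(parameters, &config);
	if (result != UDS_SUCCESS)
		return result;

	uds_log_configuration(config);
	uds_free_configuration(config);
	return UDS_SUCCESS;
}
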
diff --git a/drivers/md/dm-vdo/indexer/config.h b/drivers/md/dm-vdo/indexer/config.h
new file mode 100644 (file)
index 0000000..08507dc
--- /dev/null
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_CONFIG_H
+#define UDS_CONFIG_H
+
+#include "geometry.h"
+#include "indexer.h"
+#include "io-factory.h"
+
+/*
+ * The uds_configuration records a variety of parameters used to configure a new UDS index. Some
+ * parameters are provided by the client, while others are fixed or derived from user-supplied
+ * values. It is created when an index is created, and it is recorded in the index metadata.
+ */
+
+enum {
+       DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096,
+       DEFAULT_CACHE_CHAPTERS = 7,
+       DEFAULT_SPARSE_SAMPLE_RATE = 32,
+       MAX_ZONES = 16,
+};
+
+/* A set of configuration parameters for the indexer. */
+struct uds_configuration {
+       /* Storage device for the index */
+       struct block_device *bdev;
+
+       /* The maximum allowable size of the index */
+       size_t size;
+
+       /* The offset where the index should start */
+       off_t offset;
+
+       /* Parameters for the volume */
+
+       /* The volume layout */
+       struct index_geometry *geometry;
+
+       /* Index owner's nonce */
+       u64 nonce;
+
+       /* The number of threads used to process index requests */
+       unsigned int zone_count;
+
+       /* The number of threads used to read volume pages */
+       unsigned int read_threads;
+
+       /* Size of the page cache and sparse chapter index cache in chapters */
+       u32 cache_chapters;
+
+       /* Parameters for the volume index */
+
+       /* The mean delta for the volume index */
+       u32 volume_index_mean_delta;
+
+       /* Sampling rate for sparse indexing */
+       u32 sparse_sample_rate;
+};
+
+/* On-disk structure of data for a version 8.02 index. */
+struct uds_configuration_8_02 {
+       /* Smaller (16), Small (64) or large (256) indices */
+       u32 record_pages_per_chapter;
+       /* Total number of chapters per volume */
+       u32 chapters_per_volume;
+       /* Number of sparse chapters per volume */
+       u32 sparse_chapters_per_volume;
+       /* Size of the page cache, in chapters */
+       u32 cache_chapters;
+       /* Unused field */
+       u32 unused;
+       /* The volume index mean delta to use */
+       u32 volume_index_mean_delta;
+       /* Size of a page, used for both record pages and index pages */
+       u32 bytes_per_page;
+       /* Sampling rate for sparse indexing */
+       u32 sparse_sample_rate;
+       /* Index owner's nonce */
+       u64 nonce;
+       /* Virtual chapter remapped from physical chapter 0 */
+       u64 remapped_virtual;
+       /* New physical chapter which remapped chapter was moved to */
+       u64 remapped_physical;
+} __packed;
+
+/* On-disk structure of data for a version 6.02 index. */
+struct uds_configuration_6_02 {
+       /* Smaller (16), Small (64) or large (256) indices */
+       u32 record_pages_per_chapter;
+       /* Total number of chapters per volume */
+       u32 chapters_per_volume;
+       /* Number of sparse chapters per volume */
+       u32 sparse_chapters_per_volume;
+       /* Size of the page cache, in chapters */
+       u32 cache_chapters;
+       /* Unused field */
+       u32 unused;
+       /* The volume index mean delta to use */
+       u32 volume_index_mean_delta;
+       /* Size of a page, used for both record pages and index pages */
+       u32 bytes_per_page;
+       /* Sampling rate for sparse indexing */
+       u32 sparse_sample_rate;
+       /* Index owner's nonce */
+       u64 nonce;
+} __packed;
+
+int __must_check uds_make_configuration(const struct uds_parameters *params,
+                                       struct uds_configuration **config_ptr);
+
+void uds_free_configuration(struct uds_configuration *config);
+
+int __must_check uds_validate_config_contents(struct buffered_reader *reader,
+                                             struct uds_configuration *config);
+
+int __must_check uds_write_config_contents(struct buffered_writer *writer,
+                                          struct uds_configuration *config, u32 version);
+
+void uds_log_configuration(struct uds_configuration *config);
+
+#endif /* UDS_CONFIG_H */
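
Because both on-disk structures are __packed, their sizes follow directly from the field lists: eight u32 values plus the u64 nonce give 40 bytes for 6.02, and 8.02 appends two u64 remapping fields. A compile-time sanity check, assuming static_assert from <linux/build_bug.h>:

#include <linux/build_bug.h>

static_assert(sizeof(struct uds_configuration_6_02) == 40);
static_assert(sizeof(struct uds_configuration_8_02) ==
	      sizeof(struct uds_configuration_6_02) + 2 * sizeof(u64));
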
diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c
new file mode 100644 (file)
index 0000000..4aace70
--- /dev/null
@@ -0,0 +1,1988 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+#include "delta-index.h"
+
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/compiler.h>
+#include <linux/limits.h>
+#include <linux/log2.h>
+
+#include "cpu.h"
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "time-utils.h"
+
+#include "config.h"
+#include "indexer.h"
+
+/*
+ * The entries in a delta index could be stored in a single delta list, but to reduce search times
+ * and update costs it uses multiple delta lists. These lists are stored in a single chunk of
+ * memory managed by the delta_zone structure. The delta_zone can move the data around within its
+ * memory, so the location of each delta list is recorded as a bit offset into the memory. Because
+ * the volume index can contain over a million delta lists, we want to be efficient with the size
+ * of the delta list header information. This information is encoded into 16 bytes per list. The
+ * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to
+ * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are
+ * sufficient to store the size of a delta list.
+ *
+ * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are
+ * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and
+ * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit
+ * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's
+ * number corresponds to its index in the array.
+ *
+ * A standard delta list entry is stored as a fixed length payload (the value) followed by a
+ * variable length key (the delta). A collision entry is used when two block names have the same
+ * delta list address. A collision entry always follows a standard entry for the hash with which it
+ * collides, and is encoded with DELTA == 0 and an additional 256-bit field at the end,
+ * containing the full block name. An entry with a delta of 0 at the beginning of a delta list
+ * indicates a normal entry.
+ *
+ * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory
+ * used by small deltas. The Huffman code is specified by three parameters, which can be computed
+ * from the desired mean delta when the index is full. (See compute_coding_constants() for
+ * details.)
+ *
+ * The bit field utilities used to read and write delta entries assume that it is possible to read
+ * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two
+ * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are
+ * numbered 1 to N, and the guard lists are numbered 0 and N+1. The functions that decode the bit
+ * stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted
+ * delta list could cause this step to run off the end of the delta_zone memory, so as extra
+ * protection against this happening, the tail guard list is set to all ones.
+ *
+ * The delta_index supports two different forms. The mutable form is created by
+ * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The
+ * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and
+ * cached) chapter index pages. The immutable form does not allocate delta list headers or
+ * temporary offsets, and thus is somewhat more memory efficient.
+ */
+
+/*
+ * This is the largest field size supported by get_field() and set_field(). Any field that is
+ * larger is not guaranteed to fit in a single byte-aligned u32.
+ */
+enum {
+       MAX_FIELD_BITS = (sizeof(u32) - 1) * BITS_PER_BYTE + 1,
+};
+
+/*
+ * This is the largest field size supported by get_big_field() and set_big_field(). Any field that
+ * is larger is not guaranteed to fit in a single byte-aligned u64.
+ */
+enum {
+       MAX_BIG_FIELD_BITS = (sizeof(u64) - 1) * BITS_PER_BYTE + 1,
+};
+
+/*
+ * This is the number of guard bytes needed at the end of the memory byte array when using the bit
+ * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7
+ * bytes beyond the end of the desired field. The definition is written to make it clear how this
+ * value is derived.
+ */
+enum {
+       POST_FIELD_GUARD_BYTES = sizeof(u64) - 1,
+};
+
+/* The number of guard bits that are needed in the tail guard list */
+enum {
+       GUARD_BITS = POST_FIELD_GUARD_BYTES * BITS_PER_BYTE
+};
+
+/*
+ * The maximum size of a single delta list in bytes. We count guard bytes in this value because a
+ * buffer of this size can be used with move_bits().
+ */
+enum {
+       DELTA_LIST_MAX_BYTE_COUNT =
+               ((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES)
+};
+
+/* The number of extra bytes and bits needed to store a collision entry */
+enum {
+       COLLISION_BYTES = UDS_RECORD_NAME_SIZE,
+       COLLISION_BITS = COLLISION_BYTES * BITS_PER_BYTE
+};
+
+/*
+ * Immutable delta lists are packed into pages containing a header that encodes the delta list
+ * information into 19 bits per list (64KB bit offset).
+ */
+
+enum { IMMUTABLE_HEADER_SIZE = 19 };
+
+/*
+ * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a
+ * number to increment when the format of the data changes.
+ */
+
+enum {
+       MAGIC_SIZE = 8,
+};
+
+static const char DELTA_INDEX_MAGIC[] = "DI-00002";
+
+struct delta_index_header {
+       char magic[MAGIC_SIZE];
+       u32 zone_number;
+       u32 zone_count;
+       u32 first_list;
+       u32 list_count;
+       u64 record_count;
+       u64 collision_count;
+};
+
+/*
+ * Header data used for immutable delta index pages. This data is followed by the delta list offset
+ * table.
+ */
+struct delta_page_header {
+       /* Externally-defined nonce */
+       u64 nonce;
+       /* The virtual chapter number */
+       u64 virtual_chapter_number;
+       /* Index of the first delta list on the page */
+       u16 first_list;
+       /* Number of delta lists on the page */
+       u16 list_count;
+} __packed;
+
+static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list)
+{
+       return delta_list->start / BITS_PER_BYTE;
+}
+
+static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list)
+{
+       unsigned int bit_offset = delta_list->start % BITS_PER_BYTE;
+
+       return BITS_TO_BYTES(bit_offset + delta_list->size);
+}
+
+static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first,
+                                u32 last)
+{
+       struct delta_list *delta_list;
+       u64 new_start;
+
+       if (first == last) {
+               /* Only one list is moving, and we know there is space. */
+               delta_list = &delta_zone->delta_lists[first];
+               new_start = delta_zone->new_offsets[first];
+               if (delta_list->start != new_start) {
+                       u64 source;
+                       u64 destination;
+
+                       source = get_delta_list_byte_start(delta_list);
+                       delta_list->start = new_start;
+                       destination = get_delta_list_byte_start(delta_list);
+                       memmove(delta_zone->memory + destination,
+                               delta_zone->memory + source,
+                               get_delta_list_byte_size(delta_list));
+               }
+       } else {
+               /*
+                * There is more than one list. Divide the problem in half, and use recursive calls
+                * to process each half. Note that after this computation, first <= middle, and
+                * middle < last.
+                */
+               u32 middle = (first + last) / 2;
+
+               delta_list = &delta_zone->delta_lists[middle];
+               new_start = delta_zone->new_offsets[middle];
+
+               /*
+                * The direction that our middle list is moving determines which half of the
+                * problem must be processed first.
+                */
+               if (new_start > delta_list->start) {
+                       rebalance_delta_zone(delta_zone, middle + 1, last);
+                       rebalance_delta_zone(delta_zone, first, middle);
+               } else {
+                       rebalance_delta_zone(delta_zone, first, middle);
+                       rebalance_delta_zone(delta_zone, middle + 1, last);
+               }
+       }
+}
+
+static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size)
+{
+       /* Round up so that each zone is a multiple of 64K in size. */
+       enum {
+               ALLOC_BOUNDARY = 64 * 1024,
+       };
+
+       return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY;
+}
+
+void uds_reset_delta_index(const struct delta_index *delta_index)
+{
+       unsigned int z;
+
+       /*
+        * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one
+        * before the first real entry and one after so that we don't need to bounds check the
+        * array access when calculating preceding and following gap sizes.
+        */
+       for (z = 0; z < delta_index->zone_count; z++) {
+               u64 list_bits;
+               u64 spacing;
+               u64 offset;
+               unsigned int i;
+               struct delta_zone *zone = &delta_index->delta_zones[z];
+               struct delta_list *delta_lists = zone->delta_lists;
+
+               /* Zeroing the delta list headers initializes the head guard list correctly. */
+               memset(delta_lists, 0,
+                      (zone->list_count + 2) * sizeof(struct delta_list));
+
+               /* Set all the bits in the end guard list. */
+               list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS;
+               delta_lists[zone->list_count + 1].start = list_bits;
+               delta_lists[zone->list_count + 1].size = GUARD_BITS;
+               memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0,
+                      POST_FIELD_GUARD_BYTES);
+
+               /* Evenly space out the real delta lists by setting regular offsets. */
+               spacing = list_bits / zone->list_count;
+               offset = spacing / 2;
+               for (i = 1; i <= zone->list_count; i++) {
+                       delta_lists[i].start = offset;
+                       offset += spacing;
+               }
+
+               /* Update the statistics. */
+               zone->discard_count += zone->record_count;
+               zone->record_count = 0;
+               zone->collision_count = 0;
+       }
+}
+
+/*
+ * Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by
+ * three parameters:
+ *
+ *  MINBITS   The number of bits in the smallest code
+ *  BASE      The number of values coded using a code of length MINBITS
+ *  INCR      The number of values coded by using one additional bit
+ *
+ * These parameters are related by this equation:
+ *
+ *     BASE + INCR == 1 << MINBITS
+ *
+ * The math for the Huffman code of an exponential distribution says that
+ *
+ *     INCR = log(2) * MEAN_DELTA
+ *
+ * Then use the smallest MINBITS value so that
+ *
+ *     (1 << MINBITS) > INCR
+ *
+ * And then
+ *
+ *     BASE = (1 << MINBITS) - INCR
+ *
+ * Now the index can generate a code such that
+ * - The first BASE values code using MINBITS bits.
+ * - The next INCR values code using MINBITS+1 bits.
+ * - The next INCR values code using MINBITS+2 bits.
+ * - (and so on).
+ */
+static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys)
+{
+       /*
+        * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use
+        * floating point, use a really good integer approximation.
+        */
+       *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL;
+       *min_bits = bits_per(*incr_keys + 1);
+       *min_keys = (1 << *min_bits) - *incr_keys;
+}
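/*
 * A worked example of the computation above, assuming the default volume index
 * mean delta of 4096 (DEFAULT_VOLUME_INDEX_MEAN_DELTA in config.h):
 *
 *     INCR    = (836158 * 4096 + 603160) / 1206321 = 2839   (~ log(2) * 4096)
 *     MINBITS = bits_per(2839 + 1)                  = 12
 *     BASE    = (1 << 12) - 2839                    = 1257
 *
 * so the 1257 smallest deltas use 12-bit codes, the next 2839 use 13 bits,
 * the next 2839 use 14 bits, and so on.
 */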
+
+void uds_uninitialize_delta_index(struct delta_index *delta_index)
+{
+       unsigned int z;
+
+       if (delta_index->delta_zones == NULL)
+               return;
+
+       for (z = 0; z < delta_index->zone_count; z++) {
+               uds_free(uds_forget(delta_index->delta_zones[z].new_offsets));
+               uds_free(uds_forget(delta_index->delta_zones[z].delta_lists));
+               uds_free(uds_forget(delta_index->delta_zones[z].memory));
+       }
+
+       uds_free(delta_index->delta_zones);
+       memset(delta_index, 0, sizeof(struct delta_index));
+}
+
+static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size,
+                                u32 first_list, u32 list_count, u32 mean_delta,
+                                u32 payload_bits, u8 tag)
+{
+       int result;
+
+       result = uds_allocate(size, u8, "delta list", &delta_zone->memory);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(list_count + 2, u64, "delta list temp",
+                             &delta_zone->new_offsets);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       /* Allocate the delta lists. */
+       result = uds_allocate(list_count + 2, struct delta_list, "delta lists",
+                             &delta_zone->delta_lists);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       compute_coding_constants(mean_delta, &delta_zone->min_bits,
+                                &delta_zone->min_keys, &delta_zone->incr_keys);
+       delta_zone->value_bits = payload_bits;
+       delta_zone->buffered_writer = NULL;
+       delta_zone->size = size;
+       delta_zone->rebalance_time = 0;
+       delta_zone->rebalance_count = 0;
+       delta_zone->record_count = 0;
+       delta_zone->collision_count = 0;
+       delta_zone->discard_count = 0;
+       delta_zone->overflow_count = 0;
+       delta_zone->first_list = first_list;
+       delta_zone->list_count = list_count;
+       delta_zone->tag = tag;
+
+       return UDS_SUCCESS;
+}
+
+int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count,
+                              u32 list_count, u32 mean_delta, u32 payload_bits,
+                              size_t memory_size, u8 tag)
+{
+       int result;
+       unsigned int z;
+       size_t zone_memory;
+
+       result = uds_allocate(zone_count, struct delta_zone, "Delta Index Zones",
+                             &delta_index->delta_zones);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       delta_index->zone_count = zone_count;
+       delta_index->list_count = list_count;
+       delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count);
+       delta_index->memory_size = 0;
+       delta_index->mutable = true;
+       delta_index->tag = tag;
+
+       for (z = 0; z < zone_count; z++) {
+               u32 lists_in_zone = delta_index->lists_per_zone;
+               u32 first_list_in_zone = z * lists_in_zone;
+
+               if (z == zone_count - 1) {
+                       /*
+                        * The last zone gets fewer lists if zone_count doesn't evenly divide
+                        * list_count. We'll have an underflow if the assertion below doesn't hold.
+                        */
+                       if (delta_index->list_count <= first_list_in_zone) {
+                               uds_uninitialize_delta_index(delta_index);
+                               return uds_log_error_strerror(UDS_INVALID_ARGUMENT,
+                                                             "%u delta lists not enough for %u zones",
+                                                             list_count, zone_count);
+                       }
+                       lists_in_zone = delta_index->list_count - first_list_in_zone;
+               }
+
+               zone_memory = get_zone_memory_size(zone_count, memory_size);
+               result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory,
+                                              first_list_in_zone, lists_in_zone,
+                                              mean_delta, payload_bits, tag);
+               if (result != UDS_SUCCESS) {
+                       uds_uninitialize_delta_index(delta_index);
+                       return result;
+               }
+
+               delta_index->memory_size +=
+                       (sizeof(struct delta_zone) + zone_memory +
+                        (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64)));
+       }
+
+       uds_reset_delta_index(delta_index);
+       return UDS_SUCCESS;
+}
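+
+/*
+ * For example, with list_count = 100 and zone_count = 3, lists_per_zone is
+ * DIV_ROUND_UP(100, 3) = 34, so zones 0 and 1 manage lists 0-33 and 34-67,
+ * and the last zone manages only the remaining 32 lists, 68-99.
+ */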
+
+/* Read a bit field from an arbitrary bit boundary. */
+static inline u32 get_field(const u8 *memory, u64 offset, u8 size)
+{
+       const void *addr = memory + offset / BITS_PER_BYTE;
+
+       return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1);
+}
+
+/* Write a bit field to an arbitrary bit boundary. */
+static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size)
+{
+       void *addr = memory + offset / BITS_PER_BYTE;
+       int shift = offset % BITS_PER_BYTE;
+       u32 data = get_unaligned_le32(addr);
+
+       data &= ~(((1 << size) - 1) << shift);
+       data |= value << shift;
+       put_unaligned_le32(data, addr);
+}
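+
+/*
+ * A worked example of the two bit-field helpers above: reading a 5-bit field
+ * at bit offset 11 resolves to addr = memory + 1 and a shift of 3, so the
+ * result is (get_unaligned_le32(memory + 1) >> 3) & 0x1f. Because the shift
+ * can be up to 7 bits and the access is a single 32-bit read or write, a
+ * field handled by get_field() or set_field() can be at most 32 - 7 = 25 bits
+ * wide.
+ */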
+
+/* Get the bit offset to the immutable delta list header. */
+static inline u32 get_immutable_header_offset(u32 list_number)
+{
+       return sizeof(struct delta_page_header) * BITS_PER_BYTE +
+               list_number * IMMUTABLE_HEADER_SIZE;
+}
+
+/* Get the bit offset to the start of the immutable delta list bit stream. */
+static inline u32 get_immutable_start(const u8 *memory, u32 list_number)
+{
+       return get_field(memory, get_immutable_header_offset(list_number),
+                        IMMUTABLE_HEADER_SIZE);
+}
+
+/* Set the bit offset to the start of the immutable delta list bit stream. */
+static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start)
+{
+       set_field(start, memory, get_immutable_header_offset(list_number),
+                 IMMUTABLE_HEADER_SIZE);
+}
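+
+/*
+ * Taken together, the helpers above describe the layout of an immutable delta
+ * index page, which verify_delta_index_page() checks and
+ * uds_pack_delta_index_page() constructs:
+ *
+ *   struct delta_page_header
+ *   (list_count + 1) start offsets, IMMUTABLE_HEADER_SIZE bits each
+ *   delta list bit streams, packed back to back
+ *   POST_FIELD_GUARD_BYTES of all-ones guard bytes at the end of the page
+ *
+ * The bit stream for list i occupies the bits between start offset i and
+ * start offset i + 1, so the extra offset entry marks the end of the last
+ * list.
+ */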
+
+static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce,
+                                   u8 *memory, size_t memory_size)
+{
+       unsigned int i;
+
+       /*
+        * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the
+        * entire volume at least once.
+        */
+       if (nonce != expected_nonce)
+               return false;
+
+       /* Verify that the number of delta lists can fit in the page. */
+       if (list_count > ((memory_size - sizeof(struct delta_page_header)) *
+                         BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE))
+               return false;
+
+       /*
+        * Verify that the first delta list is immediately after the last delta
+        * list header.
+        */
+       if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1))
+               return false;
+
+       /* Verify that the lists are in the correct order. */
+       for (i = 0; i < list_count; i++) {
+               if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1))
+                       return false;
+       }
+
+       /*
+        * Verify that the last list ends on the page, and that there is room
+        * for the post-field guard bits.
+        */
+       if (get_immutable_start(memory, list_count) >
+           (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE)
+               return false;
+
+       /* Verify that the guard bytes are correctly set to all ones. */
+       for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) {
+               if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0)
+                       return false;
+       }
+
+       /* All verifications passed. */
+       return true;
+}
+
+/* Initialize a delta index page to refer to a supplied page. */
+int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
+                                   u64 expected_nonce, u32 mean_delta, u32 payload_bits,
+                                   u8 *memory, size_t memory_size)
+{
+       u64 nonce;
+       u64 vcn;
+       u64 first_list;
+       u64 list_count;
+       struct delta_page_header *header = (struct delta_page_header *) memory;
+       struct delta_zone *delta_zone = &delta_index_page->delta_zone;
+       const u8 *nonce_addr = (const u8 *) &header->nonce;
+       const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number;
+       const u8 *first_list_addr = (const u8 *) &header->first_list;
+       const u8 *list_count_addr = (const u8 *) &header->list_count;
+
+       /* First assume that the header is little endian. */
+       nonce = get_unaligned_le64(nonce_addr);
+       vcn = get_unaligned_le64(vcn_addr);
+       first_list = get_unaligned_le16(first_list_addr);
+       list_count = get_unaligned_le16(list_count_addr);
+       if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
+                                    memory_size)) {
+               /* If that fails, try big endian. */
+               nonce = get_unaligned_be64(nonce_addr);
+               vcn = get_unaligned_be64(vcn_addr);
+               first_list = get_unaligned_be16(first_list_addr);
+               list_count = get_unaligned_be16(list_count_addr);
+               if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory,
+                                            memory_size)) {
+                       /*
+                        * Both attempts failed. Do not log this as an error, because it can happen
+                        * during a rebuild if we haven't written the entire volume at least once.
+                        */
+                       return UDS_CORRUPT_DATA;
+               }
+       }
+
+       delta_index_page->delta_index.delta_zones = delta_zone;
+       delta_index_page->delta_index.zone_count = 1;
+       delta_index_page->delta_index.list_count = list_count;
+       delta_index_page->delta_index.lists_per_zone = list_count;
+       delta_index_page->delta_index.mutable = false;
+       delta_index_page->delta_index.tag = 'p';
+       delta_index_page->virtual_chapter_number = vcn;
+       delta_index_page->lowest_list_number = first_list;
+       delta_index_page->highest_list_number = first_list + list_count - 1;
+
+       compute_coding_constants(mean_delta, &delta_zone->min_bits,
+                                &delta_zone->min_keys, &delta_zone->incr_keys);
+       delta_zone->value_bits = payload_bits;
+       delta_zone->memory = memory;
+       delta_zone->delta_lists = NULL;
+       delta_zone->new_offsets = NULL;
+       delta_zone->buffered_writer = NULL;
+       delta_zone->size = memory_size;
+       delta_zone->rebalance_time = 0;
+       delta_zone->rebalance_count = 0;
+       delta_zone->record_count = 0;
+       delta_zone->collision_count = 0;
+       delta_zone->discard_count = 0;
+       delta_zone->overflow_count = 0;
+       delta_zone->first_list = 0;
+       delta_zone->list_count = list_count;
+       delta_zone->tag = 'p';
+
+       return UDS_SUCCESS;
+}
+
+/* Read a large bit field from an arbitrary bit boundary. */
+static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size)
+{
+       const void *addr = memory + offset / BITS_PER_BYTE;
+
+       return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1);
+}
+
+/* Write a large bit field to an arbitrary bit boundary. */
+static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size)
+{
+       void *addr = memory + offset / BITS_PER_BYTE;
+       u8 shift = offset % BITS_PER_BYTE;
+       u64 data = get_unaligned_le64(addr);
+
+       data &= ~(((1UL << size) - 1) << shift);
+       data |= value << shift;
+       put_unaligned_le64(data, addr);
+}
+
+/* Set a sequence of bits to all zeros. */
+static inline void set_zero(u8 *memory, u64 offset, u32 size)
+{
+       if (size > 0) {
+               u8 *addr = memory + offset / BITS_PER_BYTE;
+               u8 shift = offset % BITS_PER_BYTE;
+               u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size;
+
+               *addr++ &= ~(((1 << count) - 1) << shift);
+               for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE)
+                       *addr++ = 0;
+
+               if (size > 0)
+                       *addr &= 0xFF << size;
+       }
+}
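+
+/*
+ * For example, set_zero(memory, 5, 10) clears the top three bits of byte 0
+ * (count = 8 - 5 = 3), skips the middle loop because only 7 bits remain, and
+ * then clears the low seven bits of byte 1 with the 0xFF << 7 mask.
+ */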
+
+/*
+ * Move several bits from a higher to a lower address, moving the lower addressed bits first. The
+ * size and memory offsets are measured in bits.
+ */
+static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
+{
+       const u8 *source;
+       u8 *destination;
+       u8 offset;
+       u8 count;
+       u64 field;
+
+       /* Start by moving one field that ends on a destination int boundary. */
+       count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32)));
+       field = get_big_field(from, from_offset, count);
+       set_big_field(field, to, to_offset, count);
+       from_offset += count;
+       to_offset += count;
+       size -= count;
+
+       /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
+       offset = from_offset % BITS_PER_TYPE(u32);
+       source = from + (from_offset - offset) / BITS_PER_BYTE;
+       destination = to + to_offset / BITS_PER_BYTE;
+       while (size > MAX_BIG_FIELD_BITS) {
+               put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
+               source += sizeof(u32);
+               destination += sizeof(u32);
+               from_offset += BITS_PER_TYPE(u32);
+               to_offset += BITS_PER_TYPE(u32);
+               size -= BITS_PER_TYPE(u32);
+       }
+
+       /* Finish up by moving any remaining bits. */
+       if (size > 0) {
+               field = get_big_field(from, from_offset, size);
+               set_big_field(field, to, to_offset, size);
+       }
+}
+
+/*
+ * Move several bits from a lower to a higher address, moving the higher addressed bits first. The
+ * size and memory offsets are measured in bits.
+ */
+static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
+{
+       const u8 *source;
+       u8 *destination;
+       u8 offset;
+       u8 count;
+       u64 field;
+
+       /* Start by moving one field that begins on a destination int boundary. */
+       count = (to_offset + size) % BITS_PER_TYPE(u32);
+       if (count > 0) {
+               size -= count;
+               field = get_big_field(from, from_offset + size, count);
+               set_big_field(field, to, to_offset + size, count);
+       }
+
+       /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */
+       offset = (from_offset + size) % BITS_PER_TYPE(u32);
+       source = from + (from_offset + size - offset) / BITS_PER_BYTE;
+       destination = to + (to_offset + size) / BITS_PER_BYTE;
+       while (size > MAX_BIG_FIELD_BITS) {
+               source -= sizeof(u32);
+               destination -= sizeof(u32);
+               size -= BITS_PER_TYPE(u32);
+               put_unaligned_le32(get_unaligned_le64(source) >> offset, destination);
+       }
+
+       /* Finish up by moving any remaining bits. */
+       if (size > 0) {
+               field = get_big_field(from, from_offset, size);
+               set_big_field(field, to, to_offset, size);
+       }
+}
+
+/*
+ * Move bits from one field to another. When the fields overlap, behave as if we first move all the
+ * bits from the source to a temporary value, and then move all the bits from the temporary value
+ * to the destination. The size and memory offsets are measured in bits.
+ */
+static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size)
+{
+       u64 field;
+
+       /* A small move doesn't require special handling. */
+       if (size <= MAX_BIG_FIELD_BITS) {
+               if (size > 0) {
+                       field = get_big_field(from, from_offset, size);
+                       set_big_field(field, to, to_offset, size);
+               }
+
+               return;
+       }
+
+       if (from_offset > to_offset)
+               move_bits_down(from, from_offset, to, to_offset, size);
+       else
+               move_bits_up(from, from_offset, to, to_offset, size);
+}
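+
+/*
+ * As with memmove(), the copy direction only matters when the source and
+ * destination overlap: a move toward lower addresses must copy the
+ * lower-addressed bits first, and a move toward higher addresses must copy
+ * the higher-addressed bits first, so that no source bits are overwritten
+ * before they have been read.
+ */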
+
+/*
+ * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta
+ * lists (starting with a specified list index) is copied from the mutable delta index into a
+ * memory page used in the immutable index. The number of lists copied onto the page is returned in
+ * list_count.
+ */
+int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce,
+                             u8 *memory, size_t memory_size, u64 virtual_chapter_number,
+                             u32 first_list, u32 *list_count)
+{
+       const struct delta_zone *delta_zone;
+       struct delta_list *delta_lists;
+       u32 max_lists;
+       u32 n_lists = 0;
+       u32 offset;
+       u32 i;
+       int free_bits;
+       int bits;
+       struct delta_page_header *header;
+
+       delta_zone = &delta_index->delta_zones[0];
+       delta_lists = &delta_zone->delta_lists[first_list + 1];
+       max_lists = delta_index->list_count - first_list;
+
+       /*
+        * Compute how many lists will fit on the page. Subtract the size of the fixed header, one
+        * delta list offset, and the guard bytes from the page size to determine how much space is
+        * available for delta lists.
+        */
+       free_bits = memory_size * BITS_PER_BYTE;
+       free_bits -= get_immutable_header_offset(1);
+       free_bits -= GUARD_BITS;
+       if (free_bits < IMMUTABLE_HEADER_SIZE) {
+               /* This page is too small to store any delta lists. */
+               return uds_log_error_strerror(UDS_OVERFLOW,
+                                             "Chapter Index Page of %zu bytes is too small",
+                                             memory_size);
+       }
+
+       while (n_lists < max_lists) {
+               /* Each list requires a delta list offset and the list data. */
+               bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size;
+               if (bits > free_bits)
+                       break;
+
+               n_lists++;
+               free_bits -= bits;
+       }
+
+       *list_count = n_lists;
+
+       header = (struct delta_page_header *) memory;
+       put_unaligned_le64(header_nonce, (u8 *) &header->nonce);
+       put_unaligned_le64(virtual_chapter_number,
+                          (u8 *) &header->virtual_chapter_number);
+       put_unaligned_le16(first_list, (u8 *) &header->first_list);
+       put_unaligned_le16(n_lists, (u8 *) &header->list_count);
+
+       /* Construct the delta list offset table. */
+       offset = get_immutable_header_offset(n_lists + 1);
+       set_immutable_start(memory, 0, offset);
+       for (i = 0; i < n_lists; i++) {
+               offset += delta_lists[i].size;
+               set_immutable_start(memory, i + 1, offset);
+       }
+
+       /* Copy the delta list data onto the memory page. */
+       for (i = 0; i < n_lists; i++) {
+               move_bits(delta_zone->memory, delta_lists[i].start, memory,
+                         get_immutable_start(memory, i), delta_lists[i].size);
+       }
+
+       /* Set all the bits in the guard bytes. */
+       memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0,
+              POST_FIELD_GUARD_BYTES);
+       return UDS_SUCCESS;
+}
+
+/* Compute the new offsets of the delta lists. */
+static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index,
+                                    size_t growing_size, size_t used_space)
+{
+       size_t spacing;
+       u32 i;
+       struct delta_list *delta_lists = delta_zone->delta_lists;
+       u32 tail_guard_index = delta_zone->list_count + 1;
+
+       spacing = (delta_zone->size - used_space) / delta_zone->list_count;
+       delta_zone->new_offsets[0] = 0;
+       for (i = 0; i <= delta_zone->list_count; i++) {
+               delta_zone->new_offsets[i + 1] =
+                       (delta_zone->new_offsets[i] +
+                        get_delta_list_byte_size(&delta_lists[i]) + spacing);
+               delta_zone->new_offsets[i] *= BITS_PER_BYTE;
+               delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE;
+               if (i == 0)
+                       delta_zone->new_offsets[i + 1] -= spacing / 2;
+               if (i + 1 == growing_index)
+                       delta_zone->new_offsets[i + 1] += growing_size;
+       }
+
+       delta_zone->new_offsets[tail_guard_index] =
+               (delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size);
+}
+
+static void rebalance_lists(struct delta_zone *delta_zone)
+{
+       struct delta_list *delta_lists;
+       u32 i;
+       size_t used_space = 0;
+
+       /* Extend and balance memory to receive the delta lists */
+       delta_lists = delta_zone->delta_lists;
+       for (i = 0; i <= delta_zone->list_count + 1; i++)
+               used_space += get_delta_list_byte_size(&delta_lists[i]);
+
+       compute_new_list_offsets(delta_zone, 0, 0, used_space);
+       for (i = 1; i <= delta_zone->list_count + 1; i++)
+               delta_lists[i].start = delta_zone->new_offsets[i];
+}
+
+/* Start restoring a delta index from multiple input streams. */
+int uds_start_restoring_delta_index(struct delta_index *delta_index,
+                                   struct buffered_reader **buffered_readers,
+                                   unsigned int reader_count)
+{
+       int result;
+       unsigned int zone_count = reader_count;
+       u64 record_count = 0;
+       u64 collision_count = 0;
+       u32 first_list[MAX_ZONES];
+       u32 list_count[MAX_ZONES];
+       unsigned int z;
+       u32 list_next = 0;
+       const struct delta_zone *delta_zone;
+
+       /* Read and validate each header. */
+       for (z = 0; z < zone_count; z++) {
+               struct delta_index_header header;
+               u8 buffer[sizeof(struct delta_index_header)];
+               size_t offset = 0;
+
+               result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
+                                                      sizeof(buffer));
+               if (result != UDS_SUCCESS) {
+                       return uds_log_warning_strerror(result,
+                                                       "failed to read delta index header");
+               }
+
+               memcpy(&header.magic, buffer, MAGIC_SIZE);
+               offset += MAGIC_SIZE;
+               decode_u32_le(buffer, &offset, &header.zone_number);
+               decode_u32_le(buffer, &offset, &header.zone_count);
+               decode_u32_le(buffer, &offset, &header.first_list);
+               decode_u32_le(buffer, &offset, &header.list_count);
+               decode_u64_le(buffer, &offset, &header.record_count);
+               decode_u64_le(buffer, &offset, &header.collision_count);
+
+               result = ASSERT(offset == sizeof(struct delta_index_header),
+                               "%zu bytes decoded of %zu expected", offset,
+                               sizeof(struct delta_index_header));
+               if (result != UDS_SUCCESS) {
+                       return uds_log_warning_strerror(result,
+                                                       "failed to read delta index header");
+               }
+
+               if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) {
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "delta index file has bad magic number");
+               }
+
+               if (zone_count != header.zone_count) {
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "delta index files contain mismatched zone counts (%u,%u)",
+                                                       zone_count, header.zone_count);
+               }
+
+               if (header.zone_number != z) {
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "delta index zone %u found in slot %u",
+                                                       header.zone_number, z);
+               }
+
+               first_list[z] = header.first_list;
+               list_count[z] = header.list_count;
+               record_count += header.record_count;
+               collision_count += header.collision_count;
+
+               if (first_list[z] != list_next) {
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "delta index file for zone %u starts with list %u instead of list %u",
+                                                       z, first_list[z], list_next);
+               }
+
+               list_next += list_count[z];
+       }
+
+       if (list_next != delta_index->list_count) {
+               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                               "delta index files contain %u delta lists instead of %u delta lists",
+                                               list_next, delta_index->list_count);
+       }
+
+       if (collision_count > record_count) {
+               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                               "delta index files contain %llu collisions and %llu records",
+                                               (unsigned long long) collision_count,
+                                               (unsigned long long) record_count);
+       }
+
+       uds_reset_delta_index(delta_index);
+       delta_index->delta_zones[0].record_count = record_count;
+       delta_index->delta_zones[0].collision_count = collision_count;
+
+       /* Read the delta lists and distribute them to the proper zones. */
+       for (z = 0; z < zone_count; z++) {
+               u32 i;
+
+               delta_index->load_lists[z] = 0;
+               for (i = 0; i < list_count[z]; i++) {
+                       u16 delta_list_size;
+                       u32 list_number;
+                       unsigned int zone_number;
+                       u8 size_data[sizeof(u16)];
+
+                       result = uds_read_from_buffered_reader(buffered_readers[z],
+                                                              size_data,
+                                                              sizeof(size_data));
+                       if (result != UDS_SUCCESS) {
+                               return uds_log_warning_strerror(result,
+                                                               "failed to read delta list size");
+                       }
+
+                       delta_list_size = get_unaligned_le16(size_data);
+                       if (delta_list_size > 0)
+                               delta_index->load_lists[z] += 1;
+
+                       list_number = first_list[z] + i;
+                       zone_number = list_number / delta_index->lists_per_zone;
+                       delta_zone = &delta_index->delta_zones[zone_number];
+                       list_number -= delta_zone->first_list;
+                       delta_zone->delta_lists[list_number + 1].size = delta_list_size;
+               }
+       }
+
+       /* Prepare each zone to start receiving the delta list data. */
+       for (z = 0; z < delta_index->zone_count; z++)
+               rebalance_lists(&delta_index->delta_zones[z]);
+
+       return UDS_SUCCESS;
+}
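+
+/*
+ * The per-zone save format decoded above (and written by
+ * uds_start_saving_delta_index()) is: the MAGIC_SIZE magic bytes, the
+ * little-endian u32 fields zone_number, zone_count, first_list and
+ * list_count, the little-endian u64 fields record_count and collision_count,
+ * and then one little-endian u16 size for each delta list in the zone. The
+ * delta list data itself follows as separate records.
+ */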
+
+static int restore_delta_list_to_zone(struct delta_zone *delta_zone,
+                                     const struct delta_list_save_info *save_info,
+                                     const u8 *data)
+{
+       struct delta_list *delta_list;
+       u16 bit_count;
+       u16 byte_count;
+       u32 list_number = save_info->index - delta_zone->first_list;
+
+       if (list_number >= delta_zone->list_count) {
+               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                               "invalid delta list number %u not in range [%u,%u)",
+                                               save_info->index, delta_zone->first_list,
+                                               delta_zone->first_list + delta_zone->list_count);
+       }
+
+       delta_list = &delta_zone->delta_lists[list_number + 1];
+       if (delta_list->size == 0) {
+               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                               "unexpected delta list number %u",
+                                               save_info->index);
+       }
+
+       bit_count = delta_list->size + save_info->bit_offset;
+       byte_count = BITS_TO_BYTES(bit_count);
+       if (save_info->byte_count != byte_count) {
+               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                               "unexpected delta list size %u != %u",
+                                               save_info->byte_count, byte_count);
+       }
+
+       move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start,
+                 delta_list->size);
+       return UDS_SUCCESS;
+}
+
+static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone,
+                                  struct buffered_reader *buffered_reader, u8 *data)
+{
+       int result;
+       struct delta_list_save_info save_info;
+       u8 buffer[sizeof(struct delta_list_save_info)];
+       unsigned int new_zone;
+
+       result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer));
+       if (result != UDS_SUCCESS) {
+               return uds_log_warning_strerror(result,
+                                               "failed to read delta list data");
+       }
+
+       save_info = (struct delta_list_save_info) {
+               .tag = buffer[0],
+               .bit_offset = buffer[1],
+               .byte_count = get_unaligned_le16(&buffer[2]),
+               .index = get_unaligned_le32(&buffer[4]),
+       };
+
+       if ((save_info.bit_offset >= BITS_PER_BYTE) ||
+           (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) {
+               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                               "corrupt delta list data");
+       }
+
+       /* Make sure the data is intended for this delta index. */
+       if (save_info.tag != delta_index->tag)
+               return UDS_CORRUPT_DATA;
+
+       if (save_info.index >= delta_index->list_count) {
+               return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                               "invalid delta list number %u of %u",
+                                               save_info.index,
+                                               delta_index->list_count);
+       }
+
+       result = uds_read_from_buffered_reader(buffered_reader, data,
+                                              save_info.byte_count);
+       if (result != UDS_SUCCESS) {
+               return uds_log_warning_strerror(result,
+                                               "failed to read delta list data");
+       }
+
+       delta_index->load_lists[load_zone] -= 1;
+       new_zone = save_info.index / delta_index->lists_per_zone;
+       return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone],
+                                         &save_info, data);
+}
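+
+/*
+ * Each saved delta list is preceded by a record of
+ * sizeof(struct delta_list_save_info) bytes: the tag at byte 0, the bit
+ * offset of the list start within its first byte at byte 1, a little-endian
+ * u16 byte count at bytes 2-3, and a little-endian u32 list index at bytes
+ * 4-7. The byte_count bytes of list data follow immediately;
+ * flush_delta_list() below writes records in exactly this form.
+ */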
+
+/* Restore delta lists from saved data. */
+int uds_finish_restoring_delta_index(struct delta_index *delta_index,
+                                    struct buffered_reader **buffered_readers,
+                                    unsigned int reader_count)
+{
+       int result;
+       int saved_result = UDS_SUCCESS;
+       unsigned int z;
+       u8 *data;
+
+       result = uds_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       for (z = 0; z < reader_count; z++) {
+               while (delta_index->load_lists[z] > 0) {
+                       result = restore_delta_list_data(delta_index, z,
+                                                        buffered_readers[z], data);
+                       if (result != UDS_SUCCESS) {
+                               saved_result = result;
+                               break;
+                       }
+               }
+       }
+
+       uds_free(data);
+       return saved_result;
+}
+
+int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
+                               unsigned int reader_count)
+{
+       int result;
+       unsigned int z;
+       u8 buffer[sizeof(struct delta_list_save_info)];
+
+       for (z = 0; z < reader_count; z++) {
+               result = uds_read_from_buffered_reader(buffered_readers[z], buffer,
+                                                      sizeof(buffer));
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               if (buffer[0] != 'z')
+                       return UDS_CORRUPT_DATA;
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int flush_delta_list(struct delta_zone *zone, u32 flush_index)
+{
+       struct delta_list *delta_list;
+       u8 buffer[sizeof(struct delta_list_save_info)];
+       int result;
+
+       delta_list = &zone->delta_lists[flush_index + 1];
+
+       buffer[0] = zone->tag;
+       buffer[1] = delta_list->start % BITS_PER_BYTE;
+       put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]);
+       put_unaligned_le32(zone->first_list + flush_index, &buffer[4]);
+
+       result = uds_write_to_buffered_writer(zone->buffered_writer, buffer,
+                                             sizeof(buffer));
+       if (result != UDS_SUCCESS) {
+               uds_log_warning_strerror(result, "failed to write delta list memory");
+               return result;
+       }
+
+       result = uds_write_to_buffered_writer(zone->buffered_writer,
+                                             zone->memory + get_delta_list_byte_start(delta_list),
+                                             get_delta_list_byte_size(delta_list));
+       if (result != UDS_SUCCESS)
+               uds_log_warning_strerror(result, "failed to write delta list memory");
+
+       return result;
+}
+
+/* Start saving a delta index zone to a buffered output stream. */
+int uds_start_saving_delta_index(const struct delta_index *delta_index,
+                                unsigned int zone_number,
+                                struct buffered_writer *buffered_writer)
+{
+       int result;
+       u32 i;
+       struct delta_zone *delta_zone;
+       u8 buffer[sizeof(struct delta_index_header)];
+       size_t offset = 0;
+
+       delta_zone = &delta_index->delta_zones[zone_number];
+       memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE);
+       offset += MAGIC_SIZE;
+       encode_u32_le(buffer, &offset, zone_number);
+       encode_u32_le(buffer, &offset, delta_index->zone_count);
+       encode_u32_le(buffer, &offset, delta_zone->first_list);
+       encode_u32_le(buffer, &offset, delta_zone->list_count);
+       encode_u64_le(buffer, &offset, delta_zone->record_count);
+       encode_u64_le(buffer, &offset, delta_zone->collision_count);
+
+       result = ASSERT(offset == sizeof(struct delta_index_header),
+                       "%zu bytes encoded of %zu expected", offset,
+                       sizeof(struct delta_index_header));
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
+       if (result != UDS_SUCCESS)
+               return uds_log_warning_strerror(result,
+                                               "failed to write delta index header");
+
+       for (i = 0; i < delta_zone->list_count; i++) {
+               u8 data[sizeof(u16)];
+               struct delta_list *delta_list;
+
+               delta_list = &delta_zone->delta_lists[i + 1];
+               put_unaligned_le16(delta_list->size, data);
+               result = uds_write_to_buffered_writer(buffered_writer, data,
+                                                     sizeof(data));
+               if (result != UDS_SUCCESS)
+                       return uds_log_warning_strerror(result,
+                                                       "failed to write delta list size");
+       }
+
+       delta_zone->buffered_writer = buffered_writer;
+       return UDS_SUCCESS;
+}
+
+int uds_finish_saving_delta_index(const struct delta_index *delta_index,
+                                 unsigned int zone_number)
+{
+       int result;
+       int first_error = UDS_SUCCESS;
+       u32 i;
+       struct delta_zone *delta_zone;
+       struct delta_list *delta_list;
+
+       delta_zone = &delta_index->delta_zones[zone_number];
+       for (i = 0; i < delta_zone->list_count; i++) {
+               delta_list = &delta_zone->delta_lists[i + 1];
+               if (delta_list->size > 0) {
+                       result = flush_delta_list(delta_zone, i);
+                       if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS))
+                               first_error = result;
+               }
+       }
+
+       delta_zone->buffered_writer = NULL;
+       return first_error;
+}
+
+int uds_write_guard_delta_list(struct buffered_writer *buffered_writer)
+{
+       int result;
+       u8 buffer[sizeof(struct delta_list_save_info)];
+
+       memset(buffer, 0, sizeof(struct delta_list_save_info));
+       buffer[0] = 'z';
+
+       result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer));
+       if (result != UDS_SUCCESS)
+               uds_log_warning_strerror(result, "failed to write guard delta list");
+
+       return UDS_SUCCESS;
+}
+
+size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size)
+{
+       /* A single zone will use at least as much memory as any other zone count. */
+       return (sizeof(struct delta_index_header) +
+               list_count * (sizeof(struct delta_list_save_info) + 1) +
+               get_zone_memory_size(1, memory_size));
+}
+
+static int assert_not_at_end(const struct delta_index_entry *delta_entry)
+{
+       int result = ASSERT(!delta_entry->at_end,
+                           "operation is invalid because the list entry is at the end of the delta list");
+       if (result != UDS_SUCCESS)
+               result = UDS_BAD_STATE;
+
+       return result;
+}
+
+/*
+ * Prepare to search for an entry in the specified delta list.
+ *
+ * This is always the first function to be called when dealing with delta index entries. It is
+ * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The
+ * fields of the delta_index_entry argument will be set up for iteration, but will not contain an
+ * entry from the list.
+ */
+int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number,
+                                u32 key, struct delta_index_entry *delta_entry)
+{
+       int result;
+       unsigned int zone_number;
+       struct delta_zone *delta_zone;
+       struct delta_list *delta_list;
+
+       result = ASSERT((list_number < delta_index->list_count),
+                       "Delta list number (%u) is out of range (%u)", list_number,
+                       delta_index->list_count);
+       if (result != UDS_SUCCESS)
+               return UDS_CORRUPT_DATA;
+
+       zone_number = list_number / delta_index->lists_per_zone;
+       delta_zone = &delta_index->delta_zones[zone_number];
+       list_number -= delta_zone->first_list;
+       result = ASSERT((list_number < delta_zone->list_count),
+                       "Delta list number (%u) is out of range (%u) for zone (%u)",
+                       list_number, delta_zone->list_count, zone_number);
+       if (result != UDS_SUCCESS)
+               return UDS_CORRUPT_DATA;
+
+       if (delta_index->mutable) {
+               delta_list = &delta_zone->delta_lists[list_number + 1];
+       } else {
+               u32 end_offset;
+
+               /*
+                * Translate the immutable delta list header into a temporary
+                * full delta list header.
+                */
+               delta_list = &delta_entry->temp_delta_list;
+               delta_list->start = get_immutable_start(delta_zone->memory, list_number);
+               end_offset = get_immutable_start(delta_zone->memory, list_number + 1);
+               delta_list->size = end_offset - delta_list->start;
+               delta_list->save_key = 0;
+               delta_list->save_offset = 0;
+       }
+
+       if (key > delta_list->save_key) {
+               delta_entry->key = delta_list->save_key;
+               delta_entry->offset = delta_list->save_offset;
+       } else {
+               delta_entry->key = 0;
+               delta_entry->offset = 0;
+               if (key == 0) {
+                       /*
+                        * This usually means we're about to walk the entire delta list, so get all
+                        * of it into the CPU cache.
+                        */
+                       uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE],
+                                          delta_list->size / BITS_PER_BYTE, false);
+               }
+       }
+
+       delta_entry->at_end = false;
+       delta_entry->delta_zone = delta_zone;
+       delta_entry->delta_list = delta_list;
+       delta_entry->entry_bits = 0;
+       delta_entry->is_collision = false;
+       delta_entry->list_number = list_number;
+       delta_entry->list_overflow = false;
+       delta_entry->value_bits = delta_zone->value_bits;
+       return UDS_SUCCESS;
+}
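+
+/*
+ * uds_get_delta_index_entry() below shows the typical usage: start a search
+ * with this function, then call uds_next_delta_index_entry() repeatedly until
+ * the entry is at the end of the list or the requested key has been reached.
+ */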
+
+static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry)
+{
+       return delta_entry->delta_list->start + delta_entry->offset;
+}
+
+/*
+ * Decode a delta index entry delta value. On entry, the delta_index_entry describes the previous
+ * list entry, with its offset field already advanced to point at the subsequent entry. We decode
+ * the bit stream and update the delta_index_entry to describe the current entry.
+ */
+static inline void decode_delta(struct delta_index_entry *delta_entry)
+{
+       int key_bits;
+       u32 delta;
+       const struct delta_zone *delta_zone = delta_entry->delta_zone;
+       const u8 *memory = delta_zone->memory;
+       u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
+       const u8 *addr = memory + delta_offset / BITS_PER_BYTE;
+       int offset = delta_offset % BITS_PER_BYTE;
+       u32 data = get_unaligned_le32(addr) >> offset;
+
+       addr += sizeof(u32);
+       key_bits = delta_zone->min_bits;
+       delta = data & ((1 << key_bits) - 1);
+       if (delta >= delta_zone->min_keys) {
+               data >>= key_bits;
+               if (data == 0) {
+                       key_bits = sizeof(u32) * BITS_PER_BYTE - offset;
+                       while ((data = get_unaligned_le32(addr)) == 0) {
+                               addr += sizeof(u32);
+                               key_bits += sizeof(u32) * BITS_PER_BYTE;
+                       }
+               }
+               key_bits += ffs(data);
+               delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys);
+       }
+       delta_entry->delta = delta;
+       delta_entry->key += delta;
+
+       /* Check for a collision, a delta of zero after the start. */
+       if (unlikely((delta == 0) && (delta_entry->offset > 0))) {
+               delta_entry->is_collision = true;
+               delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS;
+       } else {
+               delta_entry->is_collision = false;
+               delta_entry->entry_bits = delta_entry->value_bits + key_bits;
+       }
+}
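+
+/*
+ * The coding scheme shared by decode_delta() and encode_delta() stores a
+ * delta smaller than min_keys directly in min_bits bits, and a larger delta
+ * as a remainder plus a unary count of incr_keys multiples. For illustration
+ * only, suppose compute_coding_constants() produced min_bits = 6,
+ * min_keys = 24, and incr_keys = 40 (made-up values): a delta of 100 encodes
+ * as 76 % 40 + 24 = 60 in six bits, then 76 / 40 = 1 zero bit, then a one
+ * bit. The decoder reads 60, sees that it is at least min_keys, finds the one
+ * bit after a single zero, and recovers 60 + 1 * 40 = 100.
+ */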
+
+noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry)
+{
+       int result;
+       const struct delta_list *delta_list;
+       u32 next_offset;
+       u16 size;
+
+       result = assert_not_at_end(delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       delta_list = delta_entry->delta_list;
+       delta_entry->offset += delta_entry->entry_bits;
+       size = delta_list->size;
+       if (unlikely(delta_entry->offset >= size)) {
+               delta_entry->at_end = true;
+               delta_entry->delta = 0;
+               delta_entry->is_collision = false;
+               result = ASSERT((delta_entry->offset == size),
+                               "next offset past end of delta list");
+               if (result != UDS_SUCCESS)
+                       result = UDS_CORRUPT_DATA;
+
+               return result;
+       }
+
+       decode_delta(delta_entry);
+
+       next_offset = delta_entry->offset + delta_entry->entry_bits;
+       if (next_offset > size) {
+               /*
+                * This is not an assertion because uds_validate_chapter_index_page() wants to
+                * handle this error.
+                */
+               uds_log_warning("Decoded past the end of the delta list");
+               return UDS_CORRUPT_DATA;
+       }
+
+       return UDS_SUCCESS;
+}
+
+int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry)
+{
+       int result;
+       struct delta_list *delta_list = delta_entry->delta_list;
+
+       result = ASSERT(!delta_entry->is_collision, "entry is not a collision");
+       if (result != UDS_SUCCESS)
+               return result;
+
+       delta_list->save_key = delta_entry->key - delta_entry->delta;
+       delta_list->save_offset = delta_entry->offset;
+       return UDS_SUCCESS;
+}
+
+static void set_delta(struct delta_index_entry *delta_entry, u32 delta)
+{
+       const struct delta_zone *delta_zone = delta_entry->delta_zone;
+       u32 key_bits = (delta_zone->min_bits +
+                       ((delta_zone->incr_keys - delta_zone->min_keys + delta) /
+                        delta_zone->incr_keys));
+
+       delta_entry->delta = delta;
+       delta_entry->entry_bits = delta_entry->value_bits + key_bits;
+}
+
+static void get_collision_name(const struct delta_index_entry *entry, u8 *name)
+{
+       u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
+       const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
+       int size = COLLISION_BYTES;
+       int shift = offset % BITS_PER_BYTE;
+
+       while (--size >= 0)
+               *name++ = get_unaligned_le16(addr++) >> shift;
+}
+
+static void set_collision_name(const struct delta_index_entry *entry, const u8 *name)
+{
+       u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS;
+       u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE;
+       int size = COLLISION_BYTES;
+       int shift = offset % BITS_PER_BYTE;
+       u16 mask = ~((u16) 0xFF << shift);
+       u16 data;
+
+       while (--size >= 0) {
+               data = (get_unaligned_le16(addr) & mask) | (*name++ << shift);
+               put_unaligned_le16(data, addr++);
+       }
+}
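+
+/*
+ * A collision entry is an ordinary entry whose delta is zero, immediately
+ * followed by the full record name (COLLISION_BYTES bytes). The two helpers
+ * above read and write that trailing name one byte at a time through an
+ * unaligned 16-bit window, since the name can begin at any bit offset.
+ */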
+
+int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number,
+                             u32 key, const u8 *name,
+                             struct delta_index_entry *delta_entry)
+{
+       int result;
+
+       result = uds_start_delta_index_search(delta_index, list_number, key,
+                                             delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       do {
+               result = uds_next_delta_index_entry(delta_entry);
+               if (result != UDS_SUCCESS)
+                       return result;
+       } while (!delta_entry->at_end && (key > delta_entry->key));
+
+       result = uds_remember_delta_index_offset(delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (!delta_entry->at_end && (key == delta_entry->key)) {
+               struct delta_index_entry collision_entry = *delta_entry;
+
+               for (;;) {
+                       u8 full_name[COLLISION_BYTES];
+
+                       result = uds_next_delta_index_entry(&collision_entry);
+                       if (result != UDS_SUCCESS)
+                               return result;
+
+                       if (collision_entry.at_end || !collision_entry.is_collision)
+                               break;
+
+                       get_collision_name(&collision_entry, full_name);
+                       if (memcmp(full_name, name, COLLISION_BYTES) == 0) {
+                               *delta_entry = collision_entry;
+                               break;
+                       }
+               }
+       }
+
+       return UDS_SUCCESS;
+}
+
+int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name)
+{
+       int result;
+
+       result = assert_not_at_end(delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = ASSERT(delta_entry->is_collision,
+                       "Cannot get full block name from a non-collision delta index entry");
+       if (result != UDS_SUCCESS)
+               return UDS_BAD_STATE;
+
+       get_collision_name(delta_entry, name);
+       return UDS_SUCCESS;
+}
+
+u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry)
+{
+       return get_field(delta_entry->delta_zone->memory,
+                        get_delta_entry_offset(delta_entry), delta_entry->value_bits);
+}
+
+static int assert_mutable_entry(const struct delta_index_entry *delta_entry)
+{
+       int result = ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list),
+                           "delta index is mutable");
+       if (result != UDS_SUCCESS)
+               result = UDS_BAD_STATE;
+
+       return result;
+}
+
+int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value)
+{
+       int result;
+       u32 value_mask = (1 << delta_entry->value_bits) - 1;
+
+       result = assert_mutable_entry(delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = assert_not_at_end(delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = ASSERT((value & value_mask) == value,
+                       "Value (%u) being set in a delta index is too large (must fit in %u bits)",
+                       value, delta_entry->value_bits);
+       if (result != UDS_SUCCESS)
+               return UDS_INVALID_ARGUMENT;
+
+       set_field(value, delta_entry->delta_zone->memory,
+                 get_delta_entry_offset(delta_entry), delta_entry->value_bits);
+       return UDS_SUCCESS;
+}
+
+/*
+ * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated
+ * by growing_index, then rebalancing the lists in the new chunk.
+ */
+static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index,
+                            size_t growing_size)
+{
+       ktime_t start_time;
+       ktime_t end_time;
+       struct delta_list *delta_lists;
+       u32 i;
+       size_t used_space;
+
+       /* Calculate the amount of space that is or will be in use. */
+       start_time = current_time_ns(CLOCK_MONOTONIC);
+       delta_lists = delta_zone->delta_lists;
+       used_space = growing_size;
+       for (i = 0; i <= delta_zone->list_count + 1; i++)
+               used_space += get_delta_list_byte_size(&delta_lists[i]);
+
+       if (delta_zone->size < used_space)
+               return UDS_OVERFLOW;
+
+       /* Compute the new offsets of the delta lists. */
+       compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space);
+
+       /*
+        * When we rebalance the delta list, we will include the end guard list in the rebalancing.
+        * It contains the end guard data, which must be copied.
+        */
+       rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1);
+       end_time = current_time_ns(CLOCK_MONOTONIC);
+       delta_zone->rebalance_count++;
+       delta_zone->rebalance_time += ktime_sub(end_time, start_time);
+       return UDS_SUCCESS;
+}
+
+static int insert_bits(struct delta_index_entry *delta_entry, u16 size)
+{
+       u64 free_before;
+       u64 free_after;
+       u64 source;
+       u64 destination;
+       u32 count;
+       bool before_flag;
+       u8 *memory;
+       struct delta_zone *delta_zone = delta_entry->delta_zone;
+       struct delta_list *delta_list = delta_entry->delta_list;
+       /* Compute bits in use before and after the inserted bits. */
+       u32 total_size = delta_list->size;
+       u32 before_size = delta_entry->offset;
+       u32 after_size = total_size - delta_entry->offset;
+
+       if (total_size + size > U16_MAX) {
+               delta_entry->list_overflow = true;
+               delta_zone->overflow_count++;
+               return UDS_OVERFLOW;
+       }
+
+       /* Compute bits available before and after the delta list. */
+       free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
+       free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
+
+       if ((size <= free_before) && (size <= free_after)) {
+               /*
+                * We have enough space to use either before or after the list. Select the smaller
+                * amount of data. If it is exactly the same, try to take from the larger amount of
+                * free space.
+                */
+               if (before_size < after_size)
+                       before_flag = true;
+               else if (after_size < before_size)
+                       before_flag = false;
+               else
+                       before_flag = free_before > free_after;
+       } else if (size <= free_before) {
+               /* There is space before but not after. */
+               before_flag = true;
+       } else if (size <= free_after) {
+               /* There is space after but not before. */
+               before_flag = false;
+       } else {
+               /*
+                * Neither of the surrounding spaces is large enough for this request. Extend
+                * and/or rebalance the delta list memory choosing to move the least amount of
+                * data.
+                */
+               int result;
+               u32 growing_index = delta_entry->list_number + 1;
+
+               before_flag = before_size < after_size;
+               if (!before_flag)
+                       growing_index++;
+               result = extend_delta_zone(delta_zone, growing_index,
+                                          BITS_TO_BYTES(size));
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       delta_list->size += size;
+       if (before_flag) {
+               source = delta_list->start;
+               destination = source - size;
+               delta_list->start -= size;
+               count = before_size;
+       } else {
+               source = delta_list->start + delta_entry->offset;
+               destination = source + size;
+               count = after_size;
+       }
+
+       memory = delta_zone->memory;
+       move_bits(memory, source, memory, destination, count);
+       return UDS_SUCCESS;
+}
+
+static void encode_delta(const struct delta_index_entry *delta_entry)
+{
+       u32 temp;
+       u32 t1;
+       u32 t2;
+       u64 offset;
+       const struct delta_zone *delta_zone = delta_entry->delta_zone;
+       u8 *memory = delta_zone->memory;
+
+       offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits;
+       if (delta_entry->delta < delta_zone->min_keys) {
+               set_field(delta_entry->delta, memory, offset, delta_zone->min_bits);
+               return;
+       }
+
+       temp = delta_entry->delta - delta_zone->min_keys;
+       t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys;
+       t2 = temp / delta_zone->incr_keys;
+       set_field(t1, memory, offset, delta_zone->min_bits);
+       set_zero(memory, offset + delta_zone->min_bits, t2);
+       set_field(1, memory, offset + delta_zone->min_bits + t2, 1);
+}
+
+static void encode_entry(const struct delta_index_entry *delta_entry, u32 value,
+                        const u8 *name)
+{
+       u8 *memory = delta_entry->delta_zone->memory;
+       u64 offset = get_delta_entry_offset(delta_entry);
+
+       set_field(value, memory, offset, delta_entry->value_bits);
+       encode_delta(delta_entry);
+       if (name != NULL)
+               set_collision_name(delta_entry, name);
+}
+
+/*
+ * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must
+ * be provided.
+ */
+int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value,
+                             const u8 *name)
+{
+       int result;
+       struct delta_zone *delta_zone;
+
+       result = assert_mutable_entry(delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (delta_entry->is_collision) {
+               /*
+                * The caller wants us to insert a collision entry onto a collision entry. This
+                * happens when we find a collision and attempt to add the name again to the index.
+                * This is normally a fatal error unless we are replaying a closed chapter while we
+                * are rebuilding a volume index.
+                */
+               return UDS_DUPLICATE_NAME;
+       }
+
+       if (delta_entry->offset < delta_entry->delta_list->save_offset) {
+               /*
+                * The saved entry offset is after the new entry and will no longer be valid, so
+                * replace it with the insertion point.
+                */
+               result = uds_remember_delta_index_offset(delta_entry);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       if (name != NULL) {
+               /* Insert a collision entry which is placed after this entry. */
+               result = assert_not_at_end(delta_entry);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               result = ASSERT((key == delta_entry->key),
+                               "incorrect key for collision entry");
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               delta_entry->offset += delta_entry->entry_bits;
+               set_delta(delta_entry, 0);
+               delta_entry->is_collision = true;
+               delta_entry->entry_bits += COLLISION_BITS;
+               result = insert_bits(delta_entry, delta_entry->entry_bits);
+       } else if (delta_entry->at_end) {
+               /* Insert a new entry at the end of the delta list. */
+               result = ASSERT((key >= delta_entry->key), "key past end of list");
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               set_delta(delta_entry, key - delta_entry->key);
+               delta_entry->key = key;
+               delta_entry->at_end = false;
+               result = insert_bits(delta_entry, delta_entry->entry_bits);
+       } else {
+               u16 old_entry_size;
+               u16 additional_size;
+               struct delta_index_entry next_entry;
+               u32 next_value;
+
+               /*
+                * Insert a new entry which requires the delta in the following entry to be
+                * updated.
+                */
+               result = ASSERT((key < delta_entry->key),
+                               "key precedes following entry");
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               result = ASSERT((key >= delta_entry->key - delta_entry->delta),
+                               "key affects following entry's delta");
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               old_entry_size = delta_entry->entry_bits;
+               next_entry = *delta_entry;
+               next_value = uds_get_delta_entry_value(&next_entry);
+               set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta));
+               delta_entry->key = key;
+               set_delta(&next_entry, next_entry.key - key);
+               next_entry.offset += delta_entry->entry_bits;
+               /* The two new entries are always bigger than the single entry being replaced. */
+               additional_size = (delta_entry->entry_bits +
+                                  next_entry.entry_bits - old_entry_size);
+               result = insert_bits(delta_entry, additional_size);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               encode_entry(&next_entry, next_value, NULL);
+       }
+
+       if (result != UDS_SUCCESS)
+               return result;
+
+       encode_entry(delta_entry, value, name);
+       delta_zone = delta_entry->delta_zone;
+       delta_zone->record_count++;
+       delta_zone->collision_count += delta_entry->is_collision ? 1 : 0;
+       return UDS_SUCCESS;
+}
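The following is a minimal sketch, not part of this patch, of how the search and insertion primitives combine. The helper and its parameters are hypothetical, and it ignores collisions; a caller inserting a record whose key collides with an existing entry would pass the record name to uds_put_delta_index_entry instead of NULL, per the comment above.

    static int example_put_if_absent(struct delta_index *index, u32 list_number,
                                     u32 key, u32 value, const u8 *name)
    {
            struct delta_index_entry entry;
            int result;

            /* Position the entry at the key, or at the insertion point for the key. */
            result = uds_get_delta_index_entry(index, list_number, key, name, &entry);
            if (result != UDS_SUCCESS)
                    return result;

            if (!entry.at_end && (entry.key == key)) {
                    /* The key is already present; just update its stored value. */
                    return uds_set_delta_entry_value(&entry, value);
            }

            /* Insert a new non-collision entry at the position found by the search. */
            return uds_put_delta_index_entry(&entry, key, value, NULL);
    }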
+
+static void delete_bits(const struct delta_index_entry *delta_entry, int size)
+{
+       u64 source;
+       u64 destination;
+       u32 count;
+       bool before_flag;
+       struct delta_list *delta_list = delta_entry->delta_list;
+       u8 *memory = delta_entry->delta_zone->memory;
+       /* Compute bits retained before and after the deleted bits. */
+       u32 total_size = delta_list->size;
+       u32 before_size = delta_entry->offset;
+       u32 after_size = total_size - delta_entry->offset - size;
+
+       /*
+        * Determine whether to add to the available space either before or after the delta list.
+        * We prefer to move the least amount of data. If it is exactly the same, try to add to the
+        * smaller amount of free space.
+        */
+       if (before_size < after_size) {
+               before_flag = true;
+       } else if (after_size < before_size) {
+               before_flag = false;
+       } else {
+               u64 free_before =
+                       (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size));
+               u64 free_after =
+                       (delta_list[1].start - (delta_list[0].start + delta_list[0].size));
+
+               before_flag = (free_before < free_after);
+       }
+
+       delta_list->size -= size;
+       if (before_flag) {
+               source = delta_list->start;
+               destination = source + size;
+               delta_list->start += size;
+               count = before_size;
+       } else {
+               destination = delta_list->start + delta_entry->offset;
+               source = destination + size;
+               count = after_size;
+       }
+
+       move_bits(memory, source, memory, destination, count);
+}
+
+int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry)
+{
+       int result;
+       struct delta_index_entry next_entry;
+       struct delta_zone *delta_zone;
+       struct delta_list *delta_list;
+
+       result = assert_mutable_entry(delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       next_entry = *delta_entry;
+       result = uds_next_delta_index_entry(&next_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       delta_zone = delta_entry->delta_zone;
+
+       if (delta_entry->is_collision) {
+               /* This is a collision entry, so just remove it. */
+               delete_bits(delta_entry, delta_entry->entry_bits);
+               next_entry.offset = delta_entry->offset;
+               delta_zone->collision_count -= 1;
+       } else if (next_entry.at_end) {
+               /* This entry is at the end of the list, so just remove it. */
+               delete_bits(delta_entry, delta_entry->entry_bits);
+               next_entry.key -= delta_entry->delta;
+               next_entry.offset = delta_entry->offset;
+       } else {
+               /* The delta in the next entry needs to be updated. */
+               u32 next_value = uds_get_delta_entry_value(&next_entry);
+               u16 old_size = delta_entry->entry_bits + next_entry.entry_bits;
+
+               if (next_entry.is_collision) {
+                       next_entry.is_collision = false;
+                       delta_zone->collision_count -= 1;
+               }
+
+               set_delta(&next_entry, delta_entry->delta + next_entry.delta);
+               next_entry.offset = delta_entry->offset;
+               /* The one new entry is always smaller than the two entries being replaced. */
+               delete_bits(delta_entry, old_size - next_entry.entry_bits);
+               encode_entry(&next_entry, next_value, NULL);
+       }
+
+       delta_zone->record_count--;
+       delta_zone->discard_count++;
+       *delta_entry = next_entry;
+
+       delta_list = delta_entry->delta_list;
+       if (delta_entry->offset < delta_list->save_offset) {
+               /* The saved entry offset is no longer valid. */
+               delta_list->save_key = 0;
+               delta_list->save_offset = 0;
+       }
+
+       return UDS_SUCCESS;
+}
+
+void uds_get_delta_index_stats(const struct delta_index *delta_index,
+                              struct delta_index_stats *stats)
+{
+       unsigned int z;
+       const struct delta_zone *delta_zone;
+
+       memset(stats, 0, sizeof(struct delta_index_stats));
+       for (z = 0; z < delta_index->zone_count; z++) {
+               delta_zone = &delta_index->delta_zones[z];
+               stats->rebalance_time += delta_zone->rebalance_time;
+               stats->rebalance_count += delta_zone->rebalance_count;
+               stats->record_count += delta_zone->record_count;
+               stats->collision_count += delta_zone->collision_count;
+               stats->discard_count += delta_zone->discard_count;
+               stats->overflow_count += delta_zone->overflow_count;
+               stats->list_count += delta_zone->list_count;
+       }
+}
+
+size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits)
+{
+       u16 min_bits;
+       u32 incr_keys;
+       u32 min_keys;
+
+       compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys);
+       /* On average, each delta is encoded into about min_bits + 1.5 bits. */
+       return entry_count * (payload_bits + min_bits + 1) + entry_count / 2;
+}
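As a worked example with hypothetical values, not part of this patch: for 65,536 entries with payload_bits = 8 and a mean delta whose minimal code uses min_bits = 12, the estimate is 65,536 * (8 + 12 + 1) + 65,536 / 2 = 1,409,024 bits, or about 21.5 bits per entry, i.e. the 8 payload bits plus the "min_bits + 1.5" average noted in the comment above.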
+
+u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
+                                  u32 payload_bits, size_t bytes_per_page)
+{
+       unsigned int bits_per_delta_list;
+       unsigned int bits_per_page;
+       size_t bits_per_index;
+
+       /* Compute the expected number of bits needed for all the entries. */
+       bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta,
+                                                     payload_bits);
+       bits_per_delta_list = bits_per_index / list_count;
+
+       /* Add in the immutable delta list headers. */
+       bits_per_index += list_count * IMMUTABLE_HEADER_SIZE;
+       /* Compute the number of usable bits on an immutable index page. */
+       bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE);
+       /*
+        * Reduce the bits per page by one immutable delta list header and one delta list to
+        * account for internal fragmentation.
+        */
+       bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list;
+       /* Now compute the number of pages needed. */
+       return DIV_ROUND_UP(bits_per_index, bits_per_page);
+}
+
+void uds_log_delta_index_entry(struct delta_index_entry *delta_entry)
+{
+       uds_log_ratelimit(uds_log_info,
+                         "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s",
+                         delta_entry->list_number, delta_entry->key,
+                         delta_entry->offset, delta_entry->at_end ? " end" : "",
+                         delta_entry->is_collision ? " collision" : "",
+                         delta_entry->delta_list->size,
+                         delta_entry->list_overflow ? " overflow" : "");
+       delta_entry->list_overflow = false;
+}
diff --git a/drivers/md/dm-vdo/indexer/delta-index.h b/drivers/md/dm-vdo/indexer/delta-index.h
new file mode 100644 (file)
index 0000000..3d2ea19
--- /dev/null
@@ -0,0 +1,279 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_DELTA_INDEX_H
+#define UDS_DELTA_INDEX_H
+
+#include <linux/cache.h>
+
+#include "numeric.h"
+#include "time-utils.h"
+
+#include "config.h"
+#include "io-factory.h"
+
+/*
+ * A delta index is a key-value store, where each entry maps an address (the key) to a payload (the
+ * value). The entries are sorted by address, and only the delta between successive addresses is
+ * stored in the entry. The addresses are assumed to be uniformly distributed, and the deltas are
+ * therefore exponentially distributed.
+ *
+ * A delta_index can either be mutable or immutable depending on its expected use. The immutable
+ * form of a delta index is used for the indexes of closed chapters committed to the volume. The
+ * mutable form of a delta index is used by the volume index, and also by the chapter index in an
+ * open chapter. Like the index as a whole, each mutable delta index is divided into a number of
+ * independent zones.
+ */
+
+struct delta_list {
+       /* The offset of the delta list start, in bits */
+       u64 start;
+       /* The number of bits in the delta list */
+       u16 size;
+       /* Where the last search "found" the key, in bits */
+       u16 save_offset;
+       /* The key for the record just before save_offset */
+       u32 save_key;
+};
+
+struct delta_zone {
+       /* The delta list memory */
+       u8 *memory;
+       /* The delta list headers */
+       struct delta_list *delta_lists;
+       /* Temporary starts of delta lists */
+       u64 *new_offsets;
+       /* Buffered writer for saving an index */
+       struct buffered_writer *buffered_writer;
+       /* The size of delta list memory */
+       size_t size;
+       /* Nanoseconds spent rebalancing */
+       ktime_t rebalance_time;
+       /* Number of memory rebalances */
+       u32 rebalance_count;
+       /* The number of bits in a stored value */
+       u8 value_bits;
+       /* The number of bits in the minimal key code */
+       u16 min_bits;
+       /* The number of keys used in a minimal code */
+       u32 min_keys;
+       /* The number of keys used for another code bit */
+       u32 incr_keys;
+       /* The number of records in the index */
+       u64 record_count;
+       /* The number of collision records */
+       u64 collision_count;
+       /* The number of records removed */
+       u64 discard_count;
+       /* The number of UDS_OVERFLOW errors detected */
+       u64 overflow_count;
+       /* The index of the first delta list */
+       u32 first_list;
+       /* The number of delta lists */
+       u32 list_count;
+       /* Tag belonging to this delta index */
+       u8 tag;
+} __aligned(L1_CACHE_BYTES);
+
+struct delta_list_save_info {
+       /* Tag identifying which delta index this list is in */
+       u8 tag;
+       /* Bit offset of the start of the list data */
+       u8 bit_offset;
+       /* Number of bytes of list data */
+       u16 byte_count;
+       /* The delta list number within the delta index */
+       u32 index;
+} __packed;
+
+struct delta_index {
+       /* The zones */
+       struct delta_zone *delta_zones;
+       /* The number of zones */
+       unsigned int zone_count;
+       /* The number of delta lists */
+       u32 list_count;
+       /* Maximum lists per zone */
+       u32 lists_per_zone;
+       /* Total memory allocated to this index */
+       size_t memory_size;
+       /* The number of non-empty lists at load time per zone */
+       u32 load_lists[MAX_ZONES];
+       /* True if this index is mutable */
+       bool mutable;
+       /* Tag belonging to this delta index */
+       u8 tag;
+};
+
+/*
+ * A delta_index_page describes a single page of a chapter index. The delta_index field allows the
+ * page to be treated as an immutable delta_index. We use the delta_zone field to treat the chapter
+ * index page as a single zone index, without the need for an additional memory allocation.
+ */
+struct delta_index_page {
+       struct delta_index delta_index;
+       /* These values are loaded from the delta_page_header */
+       u32 lowest_list_number;
+       u32 highest_list_number;
+       u64 virtual_chapter_number;
+       /* This structure describes the single zone of a delta index page. */
+       struct delta_zone delta_zone;
+};
+
+/*
+ * Notes on the delta_index_entries:
+ *
+ * The fields documented as "public" can be read by any code that uses a delta_index. The fields
+ * documented as "private" carry information between delta_index method calls and should not be
+ * used outside the delta_index module.
+ *
+ * (1) The delta_index_entry is used like an iterator when searching a delta list.
+ *
+ * (2) It is also the result of a successful search and can be used to refer to the element found
+ *     by the search.
+ *
+ * (3) It is also the result of an unsuccessful search and can be used to refer to the insertion
+ *     point for a new record.
+ *
+ * (4) If at_end is true, the delta_list entry can only be used as the insertion point for a new
+ *     record at the end of the list.
+ *
+ * (5) If at_end is false and is_collision is true, the delta_list entry fields refer to a
+ *     collision entry in the list, and the delta_list entry can be used as a reference to this
+ *     entry.
+ *
+ * (6) If at_end is false and is_collision is false, the delta_list entry fields refer to a
+ *     non-collision entry in the list. Such delta_list entries can be used as a reference to a
+ *     found entry, or an insertion point for a non-collision entry before this entry, or an
+ *     insertion point for a collision entry that collides with this entry.
+ */
+struct delta_index_entry {
+       /* Public fields */
+       /* The key for this entry */
+       u32 key;
+       /* We are after the last list entry */
+       bool at_end;
+       /* This record is a collision */
+       bool is_collision;
+
+       /* Private fields */
+       /* This delta list overflowed */
+       bool list_overflow;
+       /* The number of bits used for the value */
+       u8 value_bits;
+       /* The number of bits used for the entire entry */
+       u16 entry_bits;
+       /* The delta index zone */
+       struct delta_zone *delta_zone;
+       /* The delta list containing the entry */
+       struct delta_list *delta_list;
+       /* The delta list number */
+       u32 list_number;
+       /* Bit offset of this entry within the list */
+       u16 offset;
+       /* The delta between this and previous entry */
+       u32 delta;
+       /* Temporary delta list for immutable indices */
+       struct delta_list temp_delta_list;
+};
+
+struct delta_index_stats {
+       /* Number of bytes allocated */
+       size_t memory_allocated;
+       /* Nanoseconds spent rebalancing */
+       ktime_t rebalance_time;
+       /* Number of memory rebalances */
+       u32 rebalance_count;
+       /* The number of records in the index */
+       u64 record_count;
+       /* The number of collision records */
+       u64 collision_count;
+       /* The number of records removed */
+       u64 discard_count;
+       /* The number of UDS_OVERFLOW errors detected */
+       u64 overflow_count;
+       /* The number of delta lists */
+       u32 list_count;
+};
+
+int __must_check uds_initialize_delta_index(struct delta_index *delta_index,
+                                           unsigned int zone_count, u32 list_count,
+                                           u32 mean_delta, u32 payload_bits,
+                                           size_t memory_size, u8 tag);
+
+int __must_check uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
+                                                u64 expected_nonce, u32 mean_delta,
+                                                u32 payload_bits, u8 *memory,
+                                                size_t memory_size);
+
+void uds_uninitialize_delta_index(struct delta_index *delta_index);
+
+void uds_reset_delta_index(const struct delta_index *delta_index);
+
+int __must_check uds_pack_delta_index_page(const struct delta_index *delta_index,
+                                          u64 header_nonce, u8 *memory,
+                                          size_t memory_size,
+                                          u64 virtual_chapter_number, u32 first_list,
+                                          u32 *list_count);
+
+int __must_check uds_start_restoring_delta_index(struct delta_index *delta_index,
+                                                struct buffered_reader **buffered_readers,
+                                                unsigned int reader_count);
+
+int __must_check uds_finish_restoring_delta_index(struct delta_index *delta_index,
+                                                 struct buffered_reader **buffered_readers,
+                                                 unsigned int reader_count);
+
+int __must_check uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
+                                            unsigned int reader_count);
+
+int __must_check uds_start_saving_delta_index(const struct delta_index *delta_index,
+                                             unsigned int zone_number,
+                                             struct buffered_writer *buffered_writer);
+
+int __must_check uds_finish_saving_delta_index(const struct delta_index *delta_index,
+                                              unsigned int zone_number);
+
+int __must_check uds_write_guard_delta_list(struct buffered_writer *buffered_writer);
+
+size_t __must_check uds_compute_delta_index_save_bytes(u32 list_count,
+                                                      size_t memory_size);
+
+int __must_check uds_start_delta_index_search(const struct delta_index *delta_index,
+                                             u32 list_number, u32 key,
+                                             struct delta_index_entry *iterator);
+
+int __must_check uds_next_delta_index_entry(struct delta_index_entry *delta_entry);
+
+int __must_check uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry);
+
+int __must_check uds_get_delta_index_entry(const struct delta_index *delta_index,
+                                          u32 list_number, u32 key, const u8 *name,
+                                          struct delta_index_entry *delta_entry);
+
+int __must_check uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry,
+                                              u8 *name);
+
+u32 __must_check uds_get_delta_entry_value(const struct delta_index_entry *delta_entry);
+
+int __must_check uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value);
+
+int __must_check uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key,
+                                          u32 value, const u8 *name);
+
+int __must_check uds_remove_delta_index_entry(struct delta_index_entry *delta_entry);
+
+void uds_get_delta_index_stats(const struct delta_index *delta_index,
+                              struct delta_index_stats *stats);
+
+size_t __must_check uds_compute_delta_index_size(u32 entry_count, u32 mean_delta,
+                                                u32 payload_bits);
+
+u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
+                                  u32 payload_bits, size_t bytes_per_page);
+
+void uds_log_delta_index_entry(struct delta_index_entry *delta_entry);
+
+#endif /* UDS_DELTA_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c
new file mode 100644 (file)
index 0000000..d2b49e3
--- /dev/null
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "funnel-requestqueue.h"
+
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/wait.h>
+
+#include "funnel-queue.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "thread-utils.h"
+
+/*
+ * This queue will attempt to handle requests in reasonably sized batches instead of reacting
+ * immediately to each new request. The wait time between batches is dynamically adjusted up or
+ * down to try to balance responsiveness against wasted thread run time.
+ *
+ * If the wait time becomes long enough, the queue will become dormant and must be explicitly
+ * awoken when a new request is enqueued. The enqueue operation updates "newest" in the funnel
+ * queue via xchg (which is a memory barrier), and later checks "dormant" to decide whether to do a
+ * wakeup of the worker thread.
+ *
+ * When deciding to go to sleep, the worker thread sets "dormant" and then examines "newest" to
+ * decide if the funnel queue is idle. In dormant mode, the last examination of "newest" before
+ * going to sleep is done inside the wait_event_interruptible() macro, after a point where one or
+ * more memory barriers have been issued. (Preparing to sleep uses spin locks.) Even if the funnel
+ * queue's "next" field update isn't visible yet to make the entry accessible, its existence will
+ * kick the worker thread out of dormant mode and back into timer-based mode.
+ *
+ * Unbatched requests are used to communicate between different zone threads and will also cause
+ * the queue to awaken immediately.
+ */
+
+enum {
+       NANOSECOND = 1,
+       MICROSECOND = 1000 * NANOSECOND,
+       MILLISECOND = 1000 * MICROSECOND,
+       DEFAULT_WAIT_TIME = 20 * MICROSECOND,
+       MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2,
+       MAXIMUM_WAIT_TIME = MILLISECOND,
+       MINIMUM_BATCH = 32,
+       MAXIMUM_BATCH = 64,
+};
+
+struct uds_request_queue {
+       /* Wait queue for synchronizing producers and consumer */
+       struct wait_queue_head wait_head;
+       /* Function to process a request */
+       uds_request_queue_processor_fn processor;
+       /* Queue of new incoming requests */
+       struct funnel_queue *main_queue;
+       /* Queue of old requests to retry */
+       struct funnel_queue *retry_queue;
+       /* The thread id of the worker thread */
+       struct thread *thread;
+       /* True if the worker was started */
+       bool started;
+       /* When true, requests can be enqueued */
+       bool running;
+       /* A flag set when the worker is waiting without a timeout */
+       atomic_t dormant;
+};
+
+static inline struct uds_request *poll_queues(struct uds_request_queue *queue)
+{
+       struct funnel_queue_entry *entry;
+
+       entry = uds_funnel_queue_poll(queue->retry_queue);
+       if (entry != NULL)
+               return container_of(entry, struct uds_request, queue_link);
+
+       entry = uds_funnel_queue_poll(queue->main_queue);
+       if (entry != NULL)
+               return container_of(entry, struct uds_request, queue_link);
+
+       return NULL;
+}
+
+static inline bool are_queues_idle(struct uds_request_queue *queue)
+{
+       return uds_is_funnel_queue_idle(queue->retry_queue) &&
+              uds_is_funnel_queue_idle(queue->main_queue);
+}
+
+/*
+ * Determine if there is a next request to process, and return it if there is. Also return flags
+ * indicating whether the worker thread can sleep (for the use of wait_event() macros) and whether
+ * the thread did sleep before returning a new request.
+ */
+static inline bool dequeue_request(struct uds_request_queue *queue,
+                                  struct uds_request **request_ptr, bool *waited_ptr)
+{
+       struct uds_request *request = poll_queues(queue);
+
+       if (request != NULL) {
+               *request_ptr = request;
+               return true;
+       }
+
+       if (!READ_ONCE(queue->running)) {
+               /* Return a NULL request so the worker thread can exit. */
+               *request_ptr = NULL;
+               return true;
+       }
+
+       *request_ptr = NULL;
+       *waited_ptr = true;
+       return false;
+}
+
+static void wait_for_request(struct uds_request_queue *queue, bool dormant,
+                            unsigned long timeout, struct uds_request **request,
+                            bool *waited)
+{
+       if (dormant) {
+               wait_event_interruptible(queue->wait_head,
+                                        (dequeue_request(queue, request, waited) ||
+                                         !are_queues_idle(queue)));
+               return;
+       }
+
+       wait_event_interruptible_hrtimeout(queue->wait_head,
+                                          dequeue_request(queue, request, waited),
+                                          ns_to_ktime(timeout));
+}
+
+static void request_queue_worker(void *arg)
+{
+       struct uds_request_queue *queue = arg;
+       struct uds_request *request = NULL;
+       unsigned long time_batch = DEFAULT_WAIT_TIME;
+       bool dormant = atomic_read(&queue->dormant);
+       bool waited = false;
+       long current_batch = 0;
+
+       for (;;) {
+               wait_for_request(queue, dormant, time_batch, &request, &waited);
+               if (likely(request != NULL)) {
+                       current_batch++;
+                       queue->processor(request);
+               } else if (!READ_ONCE(queue->running)) {
+                       break;
+               }
+
+               if (dormant) {
+                       /*
+                        * The queue has been roused from dormancy. Clear the flag so enqueuers can
+                        * stop broadcasting. No fence is needed for this transition.
+                        */
+                       atomic_set(&queue->dormant, false);
+                       dormant = false;
+                       time_batch = DEFAULT_WAIT_TIME;
+               } else if (waited) {
+                       /*
+                        * We waited for this request to show up. Adjust the wait time to smooth
+                        * out the batch size.
+                        */
+                       if (current_batch < MINIMUM_BATCH) {
+                               /*
+                                * If the last batch of requests was too small, increase the wait
+                                * time.
+                                */
+                               time_batch += time_batch / 4;
+                               if (time_batch >= MAXIMUM_WAIT_TIME) {
+                                       atomic_set(&queue->dormant, true);
+                                       dormant = true;
+                               }
+                       } else if (current_batch > MAXIMUM_BATCH) {
+                               /*
+                                * If the last batch of requests was too large, decrease the wait
+                                * time.
+                                */
+                               time_batch -= time_batch / 4;
+                               if (time_batch < MINIMUM_WAIT_TIME)
+                                       time_batch = MINIMUM_WAIT_TIME;
+                       }
+                       current_batch = 0;
+               }
+       }
+
+       /*
+        * Ensure that we process any remaining requests that were enqueued before trying to shut
+        * down. The corresponding write barrier is in uds_request_queue_finish().
+        */
+       smp_rmb();
+       while ((request = poll_queues(queue)) != NULL)
+               queue->processor(request);
+}
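With the constants above (a worked example, not part of this patch): the wait time starts at 20 microseconds and grows by 25% after each undersized batch, so roughly eighteen consecutive undersized batches push it past the 1 millisecond maximum and put the queue into dormant mode; an oversized batch shrinks the wait time by 25%, but never below the 10 microsecond minimum.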
+
+int uds_make_request_queue(const char *queue_name,
+                          uds_request_queue_processor_fn processor,
+                          struct uds_request_queue **queue_ptr)
+{
+       int result;
+       struct uds_request_queue *queue;
+
+       result = uds_allocate(1, struct uds_request_queue, __func__, &queue);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       queue->processor = processor;
+       queue->running = true;
+       atomic_set(&queue->dormant, false);
+       init_waitqueue_head(&queue->wait_head);
+
+       result = uds_make_funnel_queue(&queue->main_queue);
+       if (result != UDS_SUCCESS) {
+               uds_request_queue_finish(queue);
+               return result;
+       }
+
+       result = uds_make_funnel_queue(&queue->retry_queue);
+       if (result != UDS_SUCCESS) {
+               uds_request_queue_finish(queue);
+               return result;
+       }
+
+       result = vdo_create_thread(request_queue_worker, queue, queue_name,
+                                  &queue->thread);
+       if (result != UDS_SUCCESS) {
+               uds_request_queue_finish(queue);
+               return result;
+       }
+
+       queue->started = true;
+       *queue_ptr = queue;
+       return UDS_SUCCESS;
+}
+
+static inline void wake_up_worker(struct uds_request_queue *queue)
+{
+       if (wq_has_sleeper(&queue->wait_head))
+               wake_up(&queue->wait_head);
+}
+
+void uds_request_queue_enqueue(struct uds_request_queue *queue,
+                              struct uds_request *request)
+{
+       struct funnel_queue *sub_queue;
+       bool unbatched = request->unbatched;
+
+       sub_queue = request->requeued ? queue->retry_queue : queue->main_queue;
+       uds_funnel_queue_put(sub_queue, &request->queue_link);
+
+       /*
+        * We must wake the worker thread when it is dormant. A read fence isn't needed here since
+        * we know the queue operation acts as one.
+        */
+       if (atomic_read(&queue->dormant) || unbatched)
+               wake_up_worker(queue);
+}
+
+void uds_request_queue_finish(struct uds_request_queue *queue)
+{
+       if (queue == NULL)
+               return;
+
+       /*
+        * This memory barrier ensures that any requests we queued will be seen. The point is that
+        * when dequeue_request() sees the following update to the running flag, it will also be
+        * able to see any change we made to a next field in the funnel queue entry. The
+        * corresponding read barrier is in request_queue_worker().
+        */
+       smp_wmb();
+       WRITE_ONCE(queue->running, false);
+
+       if (queue->started) {
+               wake_up_worker(queue);
+               vdo_join_threads(queue->thread);
+       }
+
+       uds_free_funnel_queue(queue->main_queue);
+       uds_free_funnel_queue(queue->retry_queue);
+       uds_free(queue);
+}
diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.h b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h
new file mode 100644 (file)
index 0000000..9b0f539
--- /dev/null
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_REQUEST_QUEUE_H
+#define UDS_REQUEST_QUEUE_H
+
+#include "indexer.h"
+
+/*
+ * A simple request queue which will handle new requests in the order in which they are received,
+ * and will attempt to handle requeued requests before new ones. However, the nature of the
+ * implementation means that it cannot guarantee this ordering; the prioritization is merely a
+ * hint.
+ */
+
+struct uds_request_queue;
+
+typedef void (*uds_request_queue_processor_fn)(struct uds_request *);
+
+int __must_check uds_make_request_queue(const char *queue_name,
+                                       uds_request_queue_processor_fn processor,
+                                       struct uds_request_queue **queue_ptr);
+
+void uds_request_queue_enqueue(struct uds_request_queue *queue,
+                              struct uds_request *request);
+
+void uds_request_queue_finish(struct uds_request_queue *queue);
+
+#endif /* UDS_REQUEST_QUEUE_H */
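A minimal usage sketch, not part of this patch; the processor callback and the surrounding function are hypothetical, but the three calls show the intended lifecycle of a queue.

    static void example_processor(struct uds_request *request)
    {
            /* Process one dequeued request; ownership of the request passes to this callback. */
    }

    static int example_queue_lifecycle(struct uds_request *request)
    {
            struct uds_request_queue *queue;
            int result;

            result = uds_make_request_queue("example queue", example_processor, &queue);
            if (result != UDS_SUCCESS)
                    return result;

            /* Hand the request to the worker thread; it will be passed to example_processor. */
            uds_request_queue_enqueue(queue, request);

            /* Drain any remaining requests, stop the worker thread, and free the queue. */
            uds_request_queue_finish(queue);
            return UDS_SUCCESS;
    }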
diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c
new file mode 100644 (file)
index 0000000..38c1828
--- /dev/null
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "geometry.h"
+
+#include <linux/compiler.h>
+#include <linux/log2.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "delta-index.h"
+#include "indexer.h"
+
+/*
+ * An index volume is divided into a fixed number of fixed-size chapters, each consisting of a
+ * fixed number of fixed-size pages. The volume layout is defined by two constants and four
+ * parameters. The constants are that index records are 32 bytes long (16-byte block name plus
+ * 16-byte metadata) and that open chapter index hash slots are one byte long. The four parameters
+ * are the number of bytes in a page, the number of record pages in a chapter, the number of
+ * chapters in a volume, and the number of chapters that are sparse. From these parameters, we can
+ * derive the rest of the layout and other index properties.
+ *
+ * The index volume is sized by its maximum memory footprint. For a dense index, the persistent
+ * storage is about 10 times the size of the memory footprint. For a sparse index, the persistent
+ * storage is about 100 times the size of the memory footprint.
+ *
+ * For a small index with a memory footprint less than 1 GB, there are three possible memory
+ * configurations: 0.25 GB, 0.5 GB, and 0.75 GB. The default geometry for each is 1024 index records
+ * per 32 KB page, 1024 chapters per volume, and either 64, 128, or 192 record pages per chapter
+ * (resulting in 6, 13, or 20 index pages per chapter) depending on the memory configuration. For
+ * the VDO default of a 0.25 GB index, this yields a deduplication window of 256 GB using about 2.5
+ * GB for the persistent storage and 256 MB of RAM.
+ *
+ * For a larger index with a memory footprint that is a multiple of 1 GB, the geometry is 1024
+ * index records per 32 KB page, 256 record pages per chapter, 26 index pages per chapter, and 1024
+ * chapters for every GB of memory footprint. For a 1 GB volume, this yields a deduplication window
+ * of 1 TB using about 9 GB of persistent storage and 1 GB of RAM.
+ *
+ * The above numbers hold for volumes which have no sparse chapters. A sparse volume has 10 times
+ * as many chapters as the corresponding non-sparse volume, which provides 10 times the
+ * deduplication window while using 10 times as much persistent storage as the equivalent
+ * non-sparse volume with the same memory footprint.
+ *
+ * If the volume has been converted from a non-lvm format to an lvm volume, the number of chapters
+ * per volume will have been reduced by one by eliminating physical chapter 0, and the virtual
+ * chapter that formerly mapped to physical chapter 0 may be remapped to another physical chapter.
+ * This remapping is expressed by storing which virtual chapter was remapped, and which physical
+ * chapter it was moved to.
+ */
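To make the 256 GB figure above concrete (a worked example, not part of this patch): the 0.25 GB configuration stores 64 record pages per chapter * 1024 records per page = 65,536 records per chapter, and 65,536 * 1024 chapters = 67,108,864 records per volume. With VDO's 4 KB block size, one record per deduplicated block gives a window of 67,108,864 * 4 KB = 256 GB.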
+
+int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter,
+                           u32 chapters_per_volume, u32 sparse_chapters_per_volume,
+                           u64 remapped_virtual, u64 remapped_physical,
+                           struct index_geometry **geometry_ptr)
+{
+       int result;
+       struct index_geometry *geometry;
+
+       result = uds_allocate(1, struct index_geometry, "geometry", &geometry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       geometry->bytes_per_page = bytes_per_page;
+       geometry->record_pages_per_chapter = record_pages_per_chapter;
+       geometry->chapters_per_volume = chapters_per_volume;
+       geometry->sparse_chapters_per_volume = sparse_chapters_per_volume;
+       geometry->dense_chapters_per_volume = chapters_per_volume - sparse_chapters_per_volume;
+       geometry->remapped_virtual = remapped_virtual;
+       geometry->remapped_physical = remapped_physical;
+
+       geometry->records_per_page = bytes_per_page / BYTES_PER_RECORD;
+       geometry->records_per_chapter = geometry->records_per_page * record_pages_per_chapter;
+       geometry->records_per_volume = (u64) geometry->records_per_chapter * chapters_per_volume;
+
+       geometry->chapter_mean_delta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS;
+       geometry->chapter_payload_bits = bits_per(record_pages_per_chapter - 1);
+       /*
+        * We want 1 delta list for every 64 records in the chapter.
+        * The "| 077" ensures that the chapter_delta_list_bits computation
+        * does not underflow.
+        */
+       geometry->chapter_delta_list_bits =
+               bits_per((geometry->records_per_chapter - 1) | 077) - 6;
+       geometry->delta_lists_per_chapter = 1 << geometry->chapter_delta_list_bits;
+       /* We need enough address bits to achieve the desired mean delta. */
+       geometry->chapter_address_bits =
+               (DEFAULT_CHAPTER_MEAN_DELTA_BITS -
+                geometry->chapter_delta_list_bits +
+                bits_per(geometry->records_per_chapter - 1));
+       geometry->index_pages_per_chapter =
+               uds_get_delta_index_page_count(geometry->records_per_chapter,
+                                              geometry->delta_lists_per_chapter,
+                                              geometry->chapter_mean_delta,
+                                              geometry->chapter_payload_bits,
+                                              bytes_per_page);
+
+       geometry->pages_per_chapter = geometry->index_pages_per_chapter + record_pages_per_chapter;
+       geometry->pages_per_volume = geometry->pages_per_chapter * chapters_per_volume;
+       geometry->bytes_per_volume =
+               bytes_per_page * (geometry->pages_per_volume + HEADER_PAGES_PER_VOLUME);
+
+       *geometry_ptr = geometry;
+       return UDS_SUCCESS;
+}
+
+int uds_copy_index_geometry(struct index_geometry *source,
+                           struct index_geometry **geometry_ptr)
+{
+       return uds_make_index_geometry(source->bytes_per_page,
+                                      source->record_pages_per_chapter,
+                                      source->chapters_per_volume,
+                                      source->sparse_chapters_per_volume,
+                                      source->remapped_virtual, source->remapped_physical,
+                                      geometry_ptr);
+}
+
+void uds_free_index_geometry(struct index_geometry *geometry)
+{
+       uds_free(geometry);
+}
+
+u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry,
+                                            u64 virtual_chapter)
+{
+       u64 delta;
+
+       if (!uds_is_reduced_index_geometry(geometry))
+               return virtual_chapter % geometry->chapters_per_volume;
+
+       if (likely(virtual_chapter > geometry->remapped_virtual)) {
+               delta = virtual_chapter - geometry->remapped_virtual;
+               if (likely(delta > geometry->remapped_physical))
+                       return delta % geometry->chapters_per_volume;
+               else
+                       return delta - 1;
+       }
+
+       if (virtual_chapter == geometry->remapped_virtual)
+               return geometry->remapped_physical;
+
+       delta = geometry->remapped_virtual - virtual_chapter;
+       if (delta < geometry->chapters_per_volume)
+               return geometry->chapters_per_volume - delta;
+
+       /* This chapter is so old the answer doesn't matter. */
+       return 0;
+}
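To illustrate the branches above with hypothetical values (not part of this patch): for a reduced geometry with chapters_per_volume = 1023, remapped_virtual = 1000, and remapped_physical = 5, virtual chapter 1000 maps to physical chapter 5 (the remapped chapter itself), virtual chapter 1003 maps to physical chapter 2 (delta = 3 does not exceed remapped_physical, so delta - 1), and virtual chapter 1006 maps to physical chapter 6 (delta = 6 exceeds remapped_physical, so delta % chapters_per_volume).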
+
+/* Check whether any sparse chapters are in use. */
+bool uds_has_sparse_chapters(const struct index_geometry *geometry,
+                            u64 oldest_virtual_chapter, u64 newest_virtual_chapter)
+{
+       return uds_is_sparse_index_geometry(geometry) &&
+               ((newest_virtual_chapter - oldest_virtual_chapter + 1) >
+                geometry->dense_chapters_per_volume);
+}
+
+bool uds_is_chapter_sparse(const struct index_geometry *geometry,
+                          u64 oldest_virtual_chapter, u64 newest_virtual_chapter,
+                          u64 virtual_chapter_number)
+{
+       return uds_has_sparse_chapters(geometry, oldest_virtual_chapter,
+                                      newest_virtual_chapter) &&
+               ((virtual_chapter_number + geometry->dense_chapters_per_volume) <=
+                newest_virtual_chapter);
+}
+
+/* Calculate how many chapters to expire after opening the newest chapter. */
+u32 uds_chapters_to_expire(const struct index_geometry *geometry, u64 newest_chapter)
+{
+       /* If the index isn't full yet, don't expire anything. */
+       if (newest_chapter < geometry->chapters_per_volume)
+               return 0;
+
+       /* If a chapter is out of order... */
+       if (geometry->remapped_physical > 0) {
+               u64 oldest_chapter = newest_chapter - geometry->chapters_per_volume;
+
+               /*
+                * ... expire an extra chapter when expiring the moved chapter to free physical
+                * space for the new chapter ...
+                */
+               if (oldest_chapter == geometry->remapped_virtual)
+                       return 2;
+
+               /*
+                * ... but don't expire anything when the new chapter will use the physical chapter
+                * freed by expiring the moved chapter.
+                */
+               if (oldest_chapter == (geometry->remapped_virtual + geometry->remapped_physical))
+                       return 0;
+       }
+
+       /* Normally, just expire one. */
+       return 1;
+}
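Continuing the hypothetical reduced geometry above (chapters_per_volume = 1023, remapped_virtual = 1000, remapped_physical = 5): opening newest chapter 2023 makes the oldest chapter 1000, the remapped chapter, so two chapters are expired; opening newest chapter 2028 makes the oldest chapter 1005 = remapped_virtual + remapped_physical, so nothing is expired; any other chapter opened once the volume is full expires exactly one.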
diff --git a/drivers/md/dm-vdo/indexer/geometry.h b/drivers/md/dm-vdo/indexer/geometry.h
new file mode 100644 (file)
index 0000000..a2ecdb2
--- /dev/null
@@ -0,0 +1,140 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_GEOMETRY_H
+#define UDS_INDEX_GEOMETRY_H
+
+#include "indexer.h"
+
+/*
+ * The index_geometry records parameters that define the layout of a UDS index volume, and the
+ * size and shape of various index structures. It is created when the index is created, and is
+ * referenced by many index sub-components.
+ */
+
+struct index_geometry {
+       /* Size of a chapter page, in bytes */
+       size_t bytes_per_page;
+       /* Number of record pages in a chapter */
+       u32 record_pages_per_chapter;
+       /* Total number of chapters in a volume */
+       u32 chapters_per_volume;
+       /* Number of sparsely-indexed chapters in a volume */
+       u32 sparse_chapters_per_volume;
+       /* Number of bits used to determine delta list numbers */
+       u8 chapter_delta_list_bits;
+       /* Virtual chapter remapped from physical chapter 0 */
+       u64 remapped_virtual;
+       /* New physical chapter where the remapped chapter can be found */
+       u64 remapped_physical;
+
+       /*
+        * The following properties are derived from the ones above, but they are computed and
+        * recorded as fields for convenience.
+        */
+       /* Total number of pages in a volume, excluding the header */
+       u32 pages_per_volume;
+       /* Total number of bytes in a volume, including the header */
+       size_t bytes_per_volume;
+       /* Number of pages in a chapter */
+       u32 pages_per_chapter;
+       /* Number of index pages in a chapter index */
+       u32 index_pages_per_chapter;
+       /* Number of records that fit on a page */
+       u32 records_per_page;
+       /* Number of records that fit in a chapter */
+       u32 records_per_chapter;
+       /* Number of records that fit in a volume */
+       u64 records_per_volume;
+       /* Number of delta lists per chapter index */
+       u32 delta_lists_per_chapter;
+       /* Mean delta for chapter indexes */
+       u32 chapter_mean_delta;
+       /* Number of bits needed for record page numbers */
+       u8 chapter_payload_bits;
+       /* Number of bits used to compute addresses for chapter delta lists */
+       u8 chapter_address_bits;
+       /* Number of densely-indexed chapters in a volume */
+       u32 dense_chapters_per_volume;
+};
+
+enum {
+       /* The number of bytes in a record (name + metadata) */
+       BYTES_PER_RECORD = (UDS_RECORD_NAME_SIZE + UDS_RECORD_DATA_SIZE),
+
+       /* The default length of a page in a chapter, in bytes */
+       DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD,
+
+       /* The default maximum number of records per page */
+       DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD,
+
+       /* The default number of record pages in a chapter */
+       DEFAULT_RECORD_PAGES_PER_CHAPTER = 256,
+
+       /* The default number of record pages in a chapter for a small index */
+       SMALL_RECORD_PAGES_PER_CHAPTER = 64,
+
+       /* The default number of chapters in a volume */
+       DEFAULT_CHAPTERS_PER_VOLUME = 1024,
+
+       /* The default number of sparsely-indexed chapters in a volume */
+       DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0,
+
+       /* The log2 of the default mean delta */
+       DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16,
+
+       /* The log2 of the number of delta lists in a large chapter */
+       DEFAULT_CHAPTER_DELTA_LIST_BITS = 12,
+
+       /* The log2 of the number of delta lists in a small chapter */
+       SMALL_CHAPTER_DELTA_LIST_BITS = 10,
+
+       /* The number of header pages per volume */
+       HEADER_PAGES_PER_VOLUME = 1,
+};
+
+int __must_check uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter,
+                                        u32 chapters_per_volume,
+                                        u32 sparse_chapters_per_volume, u64 remapped_virtual,
+                                        u64 remapped_physical,
+                                        struct index_geometry **geometry_ptr);
+
+int __must_check uds_copy_index_geometry(struct index_geometry *source,
+                                        struct index_geometry **geometry_ptr);
+
+void uds_free_index_geometry(struct index_geometry *geometry);
+
+u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry,
+                                            u64 virtual_chapter);
+
+/*
+ * Check whether this geometry is reduced by a chapter. This will only be true if the volume was
+ * converted from a non-lvm volume to an lvm volume.
+ */
+static inline bool __must_check
+uds_is_reduced_index_geometry(const struct index_geometry *geometry)
+{
+       return !!(geometry->chapters_per_volume & 1);
+}
+
+static inline bool __must_check
+uds_is_sparse_index_geometry(const struct index_geometry *geometry)
+{
+       return geometry->sparse_chapters_per_volume > 0;
+}
+
+bool __must_check uds_has_sparse_chapters(const struct index_geometry *geometry,
+                                         u64 oldest_virtual_chapter,
+                                         u64 newest_virtual_chapter);
+
+bool __must_check uds_is_chapter_sparse(const struct index_geometry *geometry,
+                                       u64 oldest_virtual_chapter,
+                                       u64 newest_virtual_chapter,
+                                       u64 virtual_chapter_number);
+
+u32 __must_check uds_chapters_to_expire(const struct index_geometry *geometry,
+                                       u64 newest_chapter);
+
+#endif /* UDS_INDEX_GEOMETRY_H */
diff --git a/drivers/md/dm-vdo/indexer/hash-utils.h b/drivers/md/dm-vdo/indexer/hash-utils.h
new file mode 100644 (file)
index 0000000..6a8dd8f
--- /dev/null
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_HASH_UTILS_H
+#define UDS_HASH_UTILS_H
+
+#include "numeric.h"
+
+#include "geometry.h"
+#include "indexer.h"
+
+/* Utilities for extracting portions of a record name for various uses. */
+
+/* How various portions of a record name are apportioned. */
+enum {
+       VOLUME_INDEX_BYTES_OFFSET = 0,
+       VOLUME_INDEX_BYTES_COUNT = 8,
+       CHAPTER_INDEX_BYTES_OFFSET = 8,
+       CHAPTER_INDEX_BYTES_COUNT = 6,
+       SAMPLE_BYTES_OFFSET = 14,
+       SAMPLE_BYTES_COUNT = 2,
+};
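In other words (summarizing the offsets above), the 16-byte record name is consumed as: bytes 0-7 for the volume index, bytes 8-13 for the chapter index, and bytes 14-15 for the sampling value.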
+
+static inline u64 uds_extract_chapter_index_bytes(const struct uds_record_name *name)
+{
+       const u8 *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET];
+       u64 bytes = (u64) get_unaligned_be16(chapter_bits) << 32;
+
+       bytes |= get_unaligned_be32(chapter_bits + 2);
+       return bytes;
+}
+
+static inline u64 uds_extract_volume_index_bytes(const struct uds_record_name *name)
+{
+       return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]);
+}
+
+static inline u32 uds_extract_sampling_bytes(const struct uds_record_name *name)
+{
+       return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]);
+}
+
+/* Compute the chapter delta list for a given name. */
+static inline u32 uds_hash_to_chapter_delta_list(const struct uds_record_name *name,
+                                                const struct index_geometry *geometry)
+{
+       return ((uds_extract_chapter_index_bytes(name) >> geometry->chapter_address_bits) &
+               ((1 << geometry->chapter_delta_list_bits) - 1));
+}
+
+/* Compute the chapter delta address for a given name. */
+static inline u32 uds_hash_to_chapter_delta_address(const struct uds_record_name *name,
+                                                   const struct index_geometry *geometry)
+{
+       return uds_extract_chapter_index_bytes(name) & ((1 << geometry->chapter_address_bits) - 1);
+}
+
+static inline unsigned int uds_name_to_hash_slot(const struct uds_record_name *name,
+                                                unsigned int slot_count)
+{
+       return (unsigned int) (uds_extract_chapter_index_bytes(name) % slot_count);
+}
+
+#endif /* UDS_HASH_UTILS_H */
diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c
new file mode 100644 (file)
index 0000000..af533aa
--- /dev/null
@@ -0,0 +1,1769 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "index-layout.h"
+
+#include <linux/random.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "murmurhash3.h"
+#include "numeric.h"
+#include "time-utils.h"
+
+#include "config.h"
+#include "open-chapter.h"
+#include "volume-index.h"
+
+/*
+ * The UDS layout on storage media is divided into a number of fixed-size regions, the sizes of
+ * which are computed when the index is created. Every header and region begins on a 4K block
+ * boundary. Save regions are further sub-divided into regions of their own.
+ *
+ * Each region has a kind and an instance number. Some kinds only have one instance and therefore
+ * use RL_SOLE_INSTANCE (-1) as the instance number. The RL_KIND_INDEX used to use instances to
+ * represent sub-indices; now, however, there is only ever one sub-index and therefore one instance.
+ * The RL_KIND_VOLUME_INDEX uses instances to record which zone is being saved.
+ *
+ * Every region header has a type and version.
+ *
+ *     +-+-+---------+--------+--------+-+
+ *     | | |   I N D E X  0   101, 0   | |
+ *     |H|C+---------+--------+--------+S|
+ *     |D|f| Volume  | Save   | Save   |e|
+ *     |R|g| Region  | Region | Region |a|
+ *     | | | 201, -1 | 202, 0 | 202, 1 |l|
+ *     +-+-+--------+---------+--------+-+
+ *
+ * The header contains the encoded region layout table as well as some index configuration data.
+ * The sub-index region and its subdivisions are maintained in the same table.
+ *
+ * There are two save regions to preserve the old state in case saving the new state is incomplete.
+ * They are used in alternation. Each save region is further divided into sub-regions.
+ *
+ *     +-+-----+------+------+-----+-----+
+ *     |H| IPM | MI   | MI   |     | OC  |
+ *     |D|     | zone | zone | ... |     |
+ *     |R| 301 | 302  | 302  |     | 303 |
+ *     | | -1  |  0   |  1   |     | -1  |
+ *     +-+-----+------+------+-----+-----+
+ *
+ * The header contains the encoded region layout table as well as index state data for that save.
+ * Each save also has a unique nonce.
+ */
+
+enum {
+       MAGIC_SIZE = 32,
+       NONCE_INFO_SIZE = 32,
+       MAX_SAVES = 2,
+};
+
+enum region_kind {
+       RL_KIND_EMPTY = 0,
+       RL_KIND_HEADER = 1,
+       RL_KIND_CONFIG = 100,
+       RL_KIND_INDEX = 101,
+       RL_KIND_SEAL = 102,
+       RL_KIND_VOLUME = 201,
+       RL_KIND_SAVE = 202,
+       RL_KIND_INDEX_PAGE_MAP = 301,
+       RL_KIND_VOLUME_INDEX = 302,
+       RL_KIND_OPEN_CHAPTER = 303,
+};
+
+/* Some region types are historical and are no longer used. */
+enum region_type {
+       RH_TYPE_FREE = 0, /* unused */
+       RH_TYPE_SUPER = 1,
+       RH_TYPE_SAVE = 2,
+       RH_TYPE_CHECKPOINT = 3, /* unused */
+       RH_TYPE_UNSAVED = 4,
+};
+
+enum {
+       RL_SOLE_INSTANCE = 65535,
+};
+
+/*
+ * Super block version 2 is the first released version.
+ *
+ * Super block version 3 is the normal version used from RHEL 8.2 onwards.
+ *
+ * Super block versions 4 through 6 were incremental development versions and
+ * are not supported.
+ *
+ * Super block version 7 is used for volumes which have been reduced in size by one chapter in
+ * order to make room to prepend LVM metadata to a volume originally created without LVM. This
+ * allows the index to retain most of its deduplication records.
+ */
+enum {
+       SUPER_VERSION_MINIMUM = 3,
+       SUPER_VERSION_CURRENT = 3,
+       SUPER_VERSION_MAXIMUM = 7,
+};
+
+static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*";
+static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */
+
+struct region_header {
+       u64 magic;
+       u64 region_blocks;
+       u16 type;
+       /* Currently always version 1 */
+       u16 version;
+       u16 region_count;
+       u16 payload;
+};
+
+struct layout_region {
+       u64 start_block;
+       u64 block_count;
+       u32 __unused;
+       u16 kind;
+       u16 instance;
+};
+
+struct region_table {
+       size_t encoded_size;
+       struct region_header header;
+       struct layout_region regions[];
+};
+
+struct index_save_data {
+       u64 timestamp;
+       u64 nonce;
+       /* Currently always version 1 */
+       u32 version;
+       u32 unused__;
+};
+
+struct index_state_version {
+       s32 signature;
+       s32 version_id;
+};
+
+static const struct index_state_version INDEX_STATE_VERSION_301 = {
+       .signature  = -1,
+       .version_id = 301,
+};
+
+struct index_state_data301 {
+       struct index_state_version version;
+       u64 newest_chapter;
+       u64 oldest_chapter;
+       u64 last_save;
+       u32 unused;
+       u32 padding;
+};
+
+struct index_save_layout {
+       unsigned int zone_count;
+       struct layout_region index_save;
+       struct layout_region header;
+       struct layout_region index_page_map;
+       struct layout_region free_space;
+       struct layout_region volume_index_zones[MAX_ZONES];
+       struct layout_region open_chapter;
+       struct index_save_data save_data;
+       struct index_state_data301 state_data;
+};
+
+struct sub_index_layout {
+       u64 nonce;
+       struct layout_region sub_index;
+       struct layout_region volume;
+       struct index_save_layout *saves;
+};
+
+struct super_block_data {
+       u8 magic_label[MAGIC_SIZE];
+       u8 nonce_info[NONCE_INFO_SIZE];
+       u64 nonce;
+       u32 version;
+       u32 block_size;
+       u16 index_count;
+       u16 max_saves;
+       /* Padding reflects a blank field on permanent storage */
+       u8 padding[4];
+       u64 open_chapter_blocks;
+       u64 page_map_blocks;
+       u64 volume_offset;
+       u64 start_offset;
+};
+
+struct index_layout {
+       struct io_factory *factory;
+       size_t factory_size;
+       off_t offset;
+       struct super_block_data super;
+       struct layout_region header;
+       struct layout_region config;
+       struct sub_index_layout index;
+       struct layout_region seal;
+       u64 total_blocks;
+};
+
+struct save_layout_sizes {
+       unsigned int save_count;
+       size_t block_size;
+       u64 volume_blocks;
+       u64 volume_index_blocks;
+       u64 page_map_blocks;
+       u64 open_chapter_blocks;
+       u64 save_blocks;
+       u64 sub_index_blocks;
+       u64 total_blocks;
+       size_t total_size;
+};
+
+static inline bool is_converted_super_block(struct super_block_data *super)
+{
+       return super->version == 7;
+}
+
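+/*
+ * Compute the sizes of the layout components. Each save slot requires one header block plus
+ * blocks for the volume index, the index page map, and the open chapter. The total layout adds
+ * three more blocks (header, config, and seal) to the volume and the save slots.
+ */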
+static int __must_check compute_sizes(const struct uds_configuration *config,
+                                     struct save_layout_sizes *sls)
+{
+       int result;
+       struct index_geometry *geometry = config->geometry;
+
+       memset(sls, 0, sizeof(*sls));
+       sls->save_count = MAX_SAVES;
+       sls->block_size = UDS_BLOCK_SIZE;
+       sls->volume_blocks = geometry->bytes_per_volume / sls->block_size;
+
+       result = uds_compute_volume_index_save_blocks(config, sls->block_size,
+                                                     &sls->volume_index_blocks);
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "cannot compute index save size");
+
+       sls->page_map_blocks =
+               DIV_ROUND_UP(uds_compute_index_page_map_save_size(geometry),
+                            sls->block_size);
+       sls->open_chapter_blocks =
+               DIV_ROUND_UP(uds_compute_saved_open_chapter_size(geometry),
+                            sls->block_size);
+       sls->save_blocks =
+               1 + (sls->volume_index_blocks + sls->page_map_blocks + sls->open_chapter_blocks);
+       sls->sub_index_blocks = sls->volume_blocks + (sls->save_count * sls->save_blocks);
+       sls->total_blocks = 3 + sls->sub_index_blocks;
+       sls->total_size = sls->total_blocks * sls->block_size;
+
+       return UDS_SUCCESS;
+}
+
+int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size)
+{
+       int result;
+       struct uds_configuration *index_config;
+       struct save_layout_sizes sizes;
+
+       if (index_size == NULL) {
+               uds_log_error("Missing output size pointer");
+               return -EINVAL;
+       }
+
+       result = uds_make_configuration(parameters, &index_config);
+       if (result != UDS_SUCCESS) {
+               uds_log_error_strerror(result, "cannot compute index size");
+               return uds_status_to_errno(result);
+       }
+
+       result = compute_sizes(index_config, &sizes);
+       uds_free_configuration(index_config);
+       if (result != UDS_SUCCESS)
+               return uds_status_to_errno(result);
+
+       *index_size = sizes.total_size;
+       return UDS_SUCCESS;
+}
+
+/* Create unique data using the current time and a pseudorandom number. */
+static void create_unique_nonce_data(u8 *buffer)
+{
+       ktime_t now = current_time_ns(CLOCK_REALTIME);
+       u32 rand;
+       size_t offset = 0;
+
+       get_random_bytes(&rand, sizeof(u32));
+       memcpy(buffer + offset, &now, sizeof(now));
+       offset += sizeof(now);
+       memcpy(buffer + offset, &rand, sizeof(rand));
+       offset += sizeof(rand);
+       while (offset < NONCE_INFO_SIZE) {
+               size_t len = min(NONCE_INFO_SIZE - offset, offset);
+
+               memcpy(buffer + offset, buffer, len);
+               offset += len;
+       }
+}
+
+static u64 hash_stuff(u64 start, const void *data, size_t len)
+{
+       u32 seed = start ^ (start >> 27);
+       u8 hash_buffer[16];
+
+       murmurhash3_128(data, len, seed, hash_buffer);
+       return get_unaligned_le64(hash_buffer + 4);
+}
+
+/* Generate a primary nonce from the provided data. */
+static u64 generate_primary_nonce(const void *data, size_t len)
+{
+       return hash_stuff(0xa1b1e0fc, data, len);
+}
+
+/*
+ * Deterministically generate a secondary nonce from an existing nonce and some arbitrary data by
+ * hashing the original nonce and the data to produce a new nonce.
+ */
+static u64 generate_secondary_nonce(u64 nonce, const void *data, size_t len)
+{
+       return hash_stuff(nonce + 1, data, len);
+}
+
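+/*
+ * The region reader and writer helpers subtract the super block start offset, which is nonzero
+ * only for volumes that were converted to make room for prepended LVM metadata.
+ */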
+static int __must_check open_layout_reader(struct index_layout *layout,
+                                          struct layout_region *lr, off_t offset,
+                                          struct buffered_reader **reader_ptr)
+{
+       return uds_make_buffered_reader(layout->factory, lr->start_block + offset,
+                                       lr->block_count, reader_ptr);
+}
+
+static int open_region_reader(struct index_layout *layout, struct layout_region *region,
+                             struct buffered_reader **reader_ptr)
+{
+       return open_layout_reader(layout, region, -layout->super.start_offset,
+                                 reader_ptr);
+}
+
+static int __must_check open_layout_writer(struct index_layout *layout,
+                                          struct layout_region *lr, off_t offset,
+                                          struct buffered_writer **writer_ptr)
+{
+       return uds_make_buffered_writer(layout->factory, lr->start_block + offset,
+                                       lr->block_count, writer_ptr);
+}
+
+static int open_region_writer(struct index_layout *layout, struct layout_region *region,
+                             struct buffered_writer **writer_ptr)
+{
+       return open_layout_writer(layout, region, -layout->super.start_offset,
+                                 writer_ptr);
+}
+
+static void generate_super_block_data(struct save_layout_sizes *sls,
+                                     struct super_block_data *super)
+{
+       memset(super, 0, sizeof(*super));
+       memcpy(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE);
+       create_unique_nonce_data(super->nonce_info);
+
+       super->nonce = generate_primary_nonce(super->nonce_info,
+                                             sizeof(super->nonce_info));
+       super->version = SUPER_VERSION_CURRENT;
+       super->block_size = sls->block_size;
+       super->index_count = 1;
+       super->max_saves = sls->save_count;
+       super->open_chapter_blocks = sls->open_chapter_blocks;
+       super->page_map_blocks = sls->page_map_blocks;
+       super->volume_offset = 0;
+       super->start_offset = 0;
+}
+
+static void define_sub_index_nonce(struct index_layout *layout)
+{
+       struct sub_index_nonce_data {
+               u64 offset;
+               u16 index_id;
+       };
+       struct sub_index_layout *sil = &layout->index;
+       u64 primary_nonce = layout->super.nonce;
+       u8 buffer[sizeof(struct sub_index_nonce_data)] = { 0 };
+       size_t offset = 0;
+
+       encode_u64_le(buffer, &offset, sil->sub_index.start_block);
+       encode_u16_le(buffer, &offset, 0);
+       sil->nonce = generate_secondary_nonce(primary_nonce, buffer, sizeof(buffer));
+       if (sil->nonce == 0) {
+               sil->nonce = generate_secondary_nonce(~primary_nonce + 1, buffer,
+                                                     sizeof(buffer));
+       }
+}
+
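+/* Lay out the sub-index: the volume region followed by the fixed number of index save slots. */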
+static void setup_sub_index(struct index_layout *layout, u64 start_block,
+                           struct save_layout_sizes *sls)
+{
+       struct sub_index_layout *sil = &layout->index;
+       u64 next_block = start_block;
+       unsigned int i;
+
+       sil->sub_index = (struct layout_region) {
+               .start_block = start_block,
+               .block_count = sls->sub_index_blocks,
+               .kind = RL_KIND_INDEX,
+               .instance = 0,
+       };
+
+       sil->volume = (struct layout_region) {
+               .start_block = next_block,
+               .block_count = sls->volume_blocks,
+               .kind = RL_KIND_VOLUME,
+               .instance = RL_SOLE_INSTANCE,
+       };
+
+       next_block += sls->volume_blocks;
+
+       for (i = 0; i < sls->save_count; i++) {
+               sil->saves[i].index_save = (struct layout_region) {
+                       .start_block = next_block,
+                       .block_count = sls->save_blocks,
+                       .kind = RL_KIND_SAVE,
+                       .instance = i,
+               };
+
+               next_block += sls->save_blocks;
+       }
+
+       define_sub_index_nonce(layout);
+}
+
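+/*
+ * Lay out the entire index: a single header block, a single config block, the sub-index (volume
+ * plus save slots), and a single seal block at the end.
+ */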
+static void initialize_layout(struct index_layout *layout, struct save_layout_sizes *sls)
+{
+       u64 next_block = layout->offset / sls->block_size;
+
+       layout->total_blocks = sls->total_blocks;
+       generate_super_block_data(sls, &layout->super);
+       layout->header = (struct layout_region) {
+               .start_block = next_block++,
+               .block_count = 1,
+               .kind = RL_KIND_HEADER,
+               .instance = RL_SOLE_INSTANCE,
+       };
+
+       layout->config = (struct layout_region) {
+               .start_block = next_block++,
+               .block_count = 1,
+               .kind = RL_KIND_CONFIG,
+               .instance = RL_SOLE_INSTANCE,
+       };
+
+       setup_sub_index(layout, next_block, sls);
+       next_block += sls->sub_index_blocks;
+
+       layout->seal = (struct layout_region) {
+               .start_block = next_block,
+               .block_count = 1,
+               .kind = RL_KIND_SEAL,
+               .instance = RL_SOLE_INSTANCE,
+       };
+}
+
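+/* Build the region table describing the contents of a single index save slot. */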
+static int __must_check make_index_save_region_table(struct index_save_layout *isl,
+                                                    struct region_table **table_ptr)
+{
+       int result;
+       unsigned int z;
+       struct region_table *table;
+       struct layout_region *lr;
+       u16 region_count;
+       size_t payload;
+       size_t type;
+
+       if (isl->zone_count > 0) {
+               /*
+                * Normal save regions: header, page map, volume index zones,
+                * open chapter, and possibly free space.
+                */
+               region_count = 3 + isl->zone_count;
+               if (isl->free_space.block_count > 0)
+                       region_count++;
+
+               payload = sizeof(isl->save_data) + sizeof(isl->state_data);
+               type = RH_TYPE_SAVE;
+       } else {
+               /* Empty save regions: header, page map, free space. */
+               region_count = 3;
+               payload = sizeof(isl->save_data);
+               type = RH_TYPE_UNSAVED;
+       }
+
+       result = uds_allocate_extended(struct region_table, region_count,
+                                      struct layout_region,
+                                      "layout region table for ISL", &table);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       lr = &table->regions[0];
+       *lr++ = isl->header;
+       *lr++ = isl->index_page_map;
+       for (z = 0; z < isl->zone_count; z++)
+               *lr++ = isl->volume_index_zones[z];
+
+       if (isl->zone_count > 0)
+               *lr++ = isl->open_chapter;
+
+       if (isl->free_space.block_count > 0)
+               *lr++ = isl->free_space;
+
+       table->header = (struct region_header) {
+               .magic = REGION_MAGIC,
+               .region_blocks = isl->index_save.block_count,
+               .type = type,
+               .version = 1,
+               .region_count = region_count,
+               .payload = payload,
+       };
+
+       table->encoded_size = (sizeof(struct region_header) + payload +
+                              region_count * sizeof(struct layout_region));
+       *table_ptr = table;
+       return UDS_SUCCESS;
+}
+
+static void encode_region_table(u8 *buffer, size_t *offset, struct region_table *table)
+{
+       unsigned int i;
+
+       encode_u64_le(buffer, offset, REGION_MAGIC);
+       encode_u64_le(buffer, offset, table->header.region_blocks);
+       encode_u16_le(buffer, offset, table->header.type);
+       encode_u16_le(buffer, offset, table->header.version);
+       encode_u16_le(buffer, offset, table->header.region_count);
+       encode_u16_le(buffer, offset, table->header.payload);
+
+       for (i = 0; i < table->header.region_count; i++) {
+               encode_u64_le(buffer, offset, table->regions[i].start_block);
+               encode_u64_le(buffer, offset, table->regions[i].block_count);
+               encode_u32_le(buffer, offset, 0);
+               encode_u16_le(buffer, offset, table->regions[i].kind);
+               encode_u16_le(buffer, offset, table->regions[i].instance);
+       }
+}
+
+static int __must_check write_index_save_header(struct index_save_layout *isl,
+                                               struct region_table *table,
+                                               struct buffered_writer *writer)
+{
+       int result;
+       u8 *buffer;
+       size_t offset = 0;
+
+       result = uds_allocate(table->encoded_size, u8, "index save data", &buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       encode_region_table(buffer, &offset, table);
+       encode_u64_le(buffer, &offset, isl->save_data.timestamp);
+       encode_u64_le(buffer, &offset, isl->save_data.nonce);
+       encode_u32_le(buffer, &offset, isl->save_data.version);
+       encode_u32_le(buffer, &offset, 0);
+       if (isl->zone_count > 0) {
+               encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.signature);
+               encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.version_id);
+               encode_u64_le(buffer, &offset, isl->state_data.newest_chapter);
+               encode_u64_le(buffer, &offset, isl->state_data.oldest_chapter);
+               encode_u64_le(buffer, &offset, isl->state_data.last_save);
+               encode_u64_le(buffer, &offset, 0);
+       }
+
+       result = uds_write_to_buffered_writer(writer, buffer, offset);
+       uds_free(buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return uds_flush_buffered_writer(writer);
+}
+
+static int write_index_save_layout(struct index_layout *layout,
+                                  struct index_save_layout *isl)
+{
+       int result;
+       struct region_table *table;
+       struct buffered_writer *writer;
+
+       result = make_index_save_region_table(isl, &table);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = open_region_writer(layout, &isl->header, &writer);
+       if (result != UDS_SUCCESS) {
+               uds_free(table);
+               return result;
+       }
+
+       result = write_index_save_header(isl, table, writer);
+       uds_free(table);
+       uds_free_buffered_writer(writer);
+
+       return result;
+}
+
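+/*
+ * Reset a save slot to the empty state: a header block, the index page map region, and the rest
+ * of the slot marked as free space, with no valid save data.
+ */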
+static void reset_index_save_layout(struct index_save_layout *isl, u64 page_map_blocks)
+{
+       u64 free_blocks;
+       u64 next_block = isl->index_save.start_block;
+
+       isl->zone_count = 0;
+       memset(&isl->save_data, 0, sizeof(isl->save_data));
+
+       isl->header = (struct layout_region) {
+               .start_block = next_block++,
+               .block_count = 1,
+               .kind = RL_KIND_HEADER,
+               .instance = RL_SOLE_INSTANCE,
+       };
+
+       isl->index_page_map = (struct layout_region) {
+               .start_block = next_block,
+               .block_count = page_map_blocks,
+               .kind = RL_KIND_INDEX_PAGE_MAP,
+               .instance = RL_SOLE_INSTANCE,
+       };
+
+       next_block += page_map_blocks;
+
+       free_blocks = isl->index_save.block_count - page_map_blocks - 1;
+       isl->free_space = (struct layout_region) {
+               .start_block = next_block,
+               .block_count = free_blocks,
+               .kind = RL_KIND_EMPTY,
+               .instance = RL_SOLE_INSTANCE,
+       };
+}
+
+static int __must_check invalidate_old_save(struct index_layout *layout,
+                                           struct index_save_layout *isl)
+{
+       reset_index_save_layout(isl, layout->super.page_map_blocks);
+       return write_index_save_layout(layout, isl);
+}
+
+static int discard_index_state_data(struct index_layout *layout)
+{
+       int result;
+       int saved_result = UDS_SUCCESS;
+       unsigned int i;
+
+       for (i = 0; i < layout->super.max_saves; i++) {
+               result = invalidate_old_save(layout, &layout->index.saves[i]);
+               if (result != UDS_SUCCESS)
+                       saved_result = result;
+       }
+
+       if (saved_result != UDS_SUCCESS) {
+               return uds_log_error_strerror(saved_result,
+                                             "%s: cannot destroy all index saves",
+                                             __func__);
+       }
+
+       return UDS_SUCCESS;
+}
+
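+/* Build the region table describing the entire layout, including every save slot. */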
+static int __must_check make_layout_region_table(struct index_layout *layout,
+                                                struct region_table **table_ptr)
+{
+       int result;
+       unsigned int i;
+       /* Regions: header, config, index, volume, saves, seal */
+       u16 region_count = 5 + layout->super.max_saves;
+       u16 payload;
+       struct region_table *table;
+       struct layout_region *lr;
+
+       result = uds_allocate_extended(struct region_table, region_count,
+                                      struct layout_region, "layout region table",
+                                      &table);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       lr = &table->regions[0];
+       *lr++ = layout->header;
+       *lr++ = layout->config;
+       *lr++ = layout->index.sub_index;
+       *lr++ = layout->index.volume;
+
+       for (i = 0; i < layout->super.max_saves; i++)
+               *lr++ = layout->index.saves[i].index_save;
+
+       *lr++ = layout->seal;
+
+       if (is_converted_super_block(&layout->super)) {
+               payload = sizeof(struct super_block_data);
+       } else {
+               payload = (sizeof(struct super_block_data) -
+                          sizeof(layout->super.volume_offset) -
+                          sizeof(layout->super.start_offset));
+       }
+
+       table->header = (struct region_header) {
+               .magic = REGION_MAGIC,
+               .region_blocks = layout->total_blocks,
+               .type = RH_TYPE_SUPER,
+               .version = 1,
+               .region_count = region_count,
+               .payload = payload,
+       };
+
+       table->encoded_size = (sizeof(struct region_header) + payload +
+                              region_count * sizeof(struct layout_region));
+       *table_ptr = table;
+       return UDS_SUCCESS;
+}
+
+static int __must_check write_layout_header(struct index_layout *layout,
+                                           struct region_table *table,
+                                           struct buffered_writer *writer)
+{
+       int result;
+       u8 *buffer;
+       size_t offset = 0;
+
+       result = uds_allocate(table->encoded_size, u8, "layout data", &buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       encode_region_table(buffer, &offset, table);
+       memcpy(buffer + offset, &layout->super.magic_label, MAGIC_SIZE);
+       offset += MAGIC_SIZE;
+       memcpy(buffer + offset, &layout->super.nonce_info, NONCE_INFO_SIZE);
+       offset += NONCE_INFO_SIZE;
+       encode_u64_le(buffer, &offset, layout->super.nonce);
+       encode_u32_le(buffer, &offset, layout->super.version);
+       encode_u32_le(buffer, &offset, layout->super.block_size);
+       encode_u16_le(buffer, &offset, layout->super.index_count);
+       encode_u16_le(buffer, &offset, layout->super.max_saves);
+       encode_u32_le(buffer, &offset, 0);
+       encode_u64_le(buffer, &offset, layout->super.open_chapter_blocks);
+       encode_u64_le(buffer, &offset, layout->super.page_map_blocks);
+
+       if (is_converted_super_block(&layout->super)) {
+               encode_u64_le(buffer, &offset, layout->super.volume_offset);
+               encode_u64_le(buffer, &offset, layout->super.start_offset);
+       }
+
+       result = uds_write_to_buffered_writer(writer, buffer, offset);
+       uds_free(buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return uds_flush_buffered_writer(writer);
+}
+
+static int __must_check write_uds_index_config(struct index_layout *layout,
+                                              struct uds_configuration *config,
+                                              off_t offset)
+{
+       int result;
+       struct buffered_writer *writer = NULL;
+
+       result = open_layout_writer(layout, &layout->config, offset, &writer);
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "failed to open config region");
+
+       result = uds_write_config_contents(writer, config, layout->super.version);
+       if (result != UDS_SUCCESS) {
+               uds_free_buffered_writer(writer);
+               return uds_log_error_strerror(result, "failed to write config region");
+       }
+
+       result = uds_flush_buffered_writer(writer);
+       if (result != UDS_SUCCESS) {
+               uds_free_buffered_writer(writer);
+               return uds_log_error_strerror(result, "cannot flush config writer");
+       }
+
+       uds_free_buffered_writer(writer);
+       return UDS_SUCCESS;
+}
+
+static int __must_check save_layout(struct index_layout *layout, off_t offset)
+{
+       int result;
+       struct buffered_writer *writer = NULL;
+       struct region_table *table;
+
+       result = make_layout_region_table(layout, &table);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = open_layout_writer(layout, &layout->header, offset, &writer);
+       if (result != UDS_SUCCESS) {
+               uds_free(table);
+               return result;
+       }
+
+       result = write_layout_header(layout, table, writer);
+       uds_free(table);
+       uds_free_buffered_writer(writer);
+
+       return result;
+}
+
+static int create_index_layout(struct index_layout *layout, struct uds_configuration *config)
+{
+       int result;
+       struct save_layout_sizes sizes;
+
+       result = compute_sizes(config, &sizes);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(sizes.save_count, struct index_save_layout, __func__,
+                             &layout->index.saves);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       initialize_layout(layout, &sizes);
+
+       result = discard_index_state_data(layout);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = write_uds_index_config(layout, config, 0);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return save_layout(layout, 0);
+}
+
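+/*
+ * Generate the nonce for an index save from the volume nonce and the save's timestamp, version,
+ * and starting block.
+ */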
+static u64 generate_index_save_nonce(u64 volume_nonce, struct index_save_layout *isl)
+{
+       struct save_nonce_data {
+               struct index_save_data data;
+               u64 offset;
+       } nonce_data;
+       u8 buffer[sizeof(nonce_data)];
+       size_t offset = 0;
+
+       encode_u64_le(buffer, &offset, isl->save_data.timestamp);
+       encode_u64_le(buffer, &offset, 0);
+       encode_u32_le(buffer, &offset, isl->save_data.version);
+       encode_u32_le(buffer, &offset, 0U);
+       encode_u64_le(buffer, &offset, isl->index_save.start_block);
+       ASSERT_LOG_ONLY(offset == sizeof(nonce_data),
+                       "%zu bytes encoded of %zu expected", offset, sizeof(nonce_data));
+       return generate_secondary_nonce(volume_nonce, buffer, sizeof(buffer));
+}
+
+static u64 validate_index_save_layout(struct index_save_layout *isl, u64 volume_nonce)
+{
+       if ((isl->zone_count == 0) || (isl->save_data.timestamp == 0))
+               return 0;
+
+       if (isl->save_data.nonce != generate_index_save_nonce(volume_nonce, isl))
+               return 0;
+
+       return isl->save_data.timestamp;
+}
+
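+/* Find the save slot containing the most recent valid save, if there is one. */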
+static int find_latest_uds_index_save_slot(struct index_layout *layout,
+                                          struct index_save_layout **isl_ptr)
+{
+       struct index_save_layout *latest = NULL;
+       struct index_save_layout *isl;
+       unsigned int i;
+       u64 save_time = 0;
+       u64 latest_time = 0;
+
+       for (i = 0; i < layout->super.max_saves; i++) {
+               isl = &layout->index.saves[i];
+               save_time = validate_index_save_layout(isl, layout->index.nonce);
+               if (save_time > latest_time) {
+                       latest = isl;
+                       latest_time = save_time;
+               }
+       }
+
+       if (latest == NULL) {
+               uds_log_error("No valid index save found");
+               return UDS_INDEX_NOT_SAVED_CLEANLY;
+       }
+
+       *isl_ptr = latest;
+       return UDS_SUCCESS;
+}
+
+int uds_discard_open_chapter(struct index_layout *layout)
+{
+       int result;
+       struct index_save_layout *isl;
+       struct buffered_writer *writer;
+
+       result = find_latest_uds_index_save_slot(layout, &isl);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = open_region_writer(layout, &isl->open_chapter, &writer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_write_to_buffered_writer(writer, NULL, UDS_BLOCK_SIZE);
+       if (result != UDS_SUCCESS) {
+               uds_free_buffered_writer(writer);
+               return result;
+       }
+
+       result = uds_flush_buffered_writer(writer);
+       uds_free_buffered_writer(writer);
+       return result;
+}
+
+int uds_load_index_state(struct index_layout *layout, struct uds_index *index)
+{
+       int result;
+       unsigned int zone;
+       struct index_save_layout *isl;
+       struct buffered_reader *readers[MAX_ZONES];
+
+       result = find_latest_uds_index_save_slot(layout, &isl);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       index->newest_virtual_chapter = isl->state_data.newest_chapter;
+       index->oldest_virtual_chapter = isl->state_data.oldest_chapter;
+       index->last_save = isl->state_data.last_save;
+
+       result = open_region_reader(layout, &isl->open_chapter, &readers[0]);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_load_open_chapter(index, readers[0]);
+       uds_free_buffered_reader(readers[0]);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       for (zone = 0; zone < isl->zone_count; zone++) {
+               result = open_region_reader(layout, &isl->volume_index_zones[zone],
+                                           &readers[zone]);
+               if (result != UDS_SUCCESS) {
+                       for (; zone > 0; zone--)
+                               uds_free_buffered_reader(readers[zone - 1]);
+
+                       return result;
+               }
+       }
+
+       result = uds_load_volume_index(index->volume_index, readers, isl->zone_count);
+       for (zone = 0; zone < isl->zone_count; zone++)
+               uds_free_buffered_reader(readers[zone]);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = open_region_reader(layout, &isl->index_page_map, &readers[0]);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_read_index_page_map(index->volume->index_page_map, readers[0]);
+       uds_free_buffered_reader(readers[0]);
+
+       return result;
+}
+
+static struct index_save_layout *select_oldest_index_save_layout(struct index_layout *layout)
+{
+       struct index_save_layout *oldest = NULL;
+       struct index_save_layout *isl;
+       unsigned int i;
+       u64 save_time = 0;
+       u64 oldest_time = 0;
+
+       for (i = 0; i < layout->super.max_saves; i++) {
+               isl = &layout->index.saves[i];
+               save_time = validate_index_save_layout(isl, layout->index.nonce);
+               if (oldest == NULL || save_time < oldest_time) {
+                       oldest = isl;
+                       oldest_time = save_time;
+               }
+       }
+
+       return oldest;
+}
+
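+/*
+ * Populate a save slot for a new save: a header block, the index page map, one volume index
+ * region per zone (dividing the available space evenly), the open chapter region, and any
+ * remaining blocks as free space.
+ */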
+static void instantiate_index_save_layout(struct index_save_layout *isl,
+                                         struct super_block_data *super,
+                                         u64 volume_nonce, unsigned int zone_count)
+{
+       unsigned int z;
+       u64 next_block;
+       u64 free_blocks;
+       u64 volume_index_blocks;
+
+       isl->zone_count = zone_count;
+       memset(&isl->save_data, 0, sizeof(isl->save_data));
+       isl->save_data.timestamp = ktime_to_ms(current_time_ns(CLOCK_REALTIME));
+       isl->save_data.version = 1;
+       isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl);
+
+       next_block = isl->index_save.start_block;
+       isl->header = (struct layout_region) {
+               .start_block = next_block++,
+               .block_count = 1,
+               .kind = RL_KIND_HEADER,
+               .instance = RL_SOLE_INSTANCE,
+       };
+
+       isl->index_page_map = (struct layout_region) {
+               .start_block = next_block,
+               .block_count = super->page_map_blocks,
+               .kind = RL_KIND_INDEX_PAGE_MAP,
+               .instance = RL_SOLE_INSTANCE,
+       };
+       next_block += super->page_map_blocks;
+
+       free_blocks = (isl->index_save.block_count - 1 -
+                      super->page_map_blocks -
+                      super->open_chapter_blocks);
+       volume_index_blocks = free_blocks / isl->zone_count;
+       for (z = 0; z < isl->zone_count; z++) {
+               isl->volume_index_zones[z] = (struct layout_region) {
+                       .start_block = next_block,
+                       .block_count = volume_index_blocks,
+                       .kind = RL_KIND_VOLUME_INDEX,
+                       .instance = z,
+               };
+
+               next_block += volume_index_blocks;
+               free_blocks -= volume_index_blocks;
+       }
+
+       isl->open_chapter = (struct layout_region) {
+               .start_block = next_block,
+               .block_count = super->open_chapter_blocks,
+               .kind = RL_KIND_OPEN_CHAPTER,
+               .instance = RL_SOLE_INSTANCE,
+       };
+
+       next_block += super->open_chapter_blocks;
+
+       isl->free_space = (struct layout_region) {
+               .start_block = next_block,
+               .block_count = free_blocks,
+               .kind = RL_KIND_EMPTY,
+               .instance = RL_SOLE_INSTANCE,
+       };
+}
+
+static int setup_uds_index_save_slot(struct index_layout *layout,
+                                    unsigned int zone_count,
+                                    struct index_save_layout **isl_ptr)
+{
+       int result;
+       struct index_save_layout *isl;
+
+       isl = select_oldest_index_save_layout(layout);
+       result = invalidate_old_save(layout, isl);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       instantiate_index_save_layout(isl, &layout->super, layout->index.nonce,
+                                     zone_count);
+
+       *isl_ptr = isl;
+       return UDS_SUCCESS;
+}
+
+static void cancel_uds_index_save(struct index_save_layout *isl)
+{
+       memset(&isl->save_data, 0, sizeof(isl->save_data));
+       memset(&isl->state_data, 0, sizeof(isl->state_data));
+       isl->zone_count = 0;
+}
+
+int uds_save_index_state(struct index_layout *layout, struct uds_index *index)
+{
+       int result;
+       unsigned int zone;
+       struct index_save_layout *isl;
+       struct buffered_writer *writers[MAX_ZONES];
+
+       result = setup_uds_index_save_slot(layout, index->zone_count, &isl);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       isl->state_data = (struct index_state_data301) {
+               .newest_chapter = index->newest_virtual_chapter,
+               .oldest_chapter = index->oldest_virtual_chapter,
+               .last_save = index->last_save,
+       };
+
+       result = open_region_writer(layout, &isl->open_chapter, &writers[0]);
+       if (result != UDS_SUCCESS) {
+               cancel_uds_index_save(isl);
+               return result;
+       }
+
+       result = uds_save_open_chapter(index, writers[0]);
+       uds_free_buffered_writer(writers[0]);
+       if (result != UDS_SUCCESS) {
+               cancel_uds_index_save(isl);
+               return result;
+       }
+
+       for (zone = 0; zone < index->zone_count; zone++) {
+               result = open_region_writer(layout, &isl->volume_index_zones[zone],
+                                           &writers[zone]);
+               if (result != UDS_SUCCESS) {
+                       for (; zone > 0; zone--)
+                               uds_free_buffered_writer(writers[zone - 1]);
+
+                       cancel_uds_index_save(isl);
+                       return result;
+               }
+       }
+
+       result = uds_save_volume_index(index->volume_index, writers, index->zone_count);
+       for (zone = 0; zone < index->zone_count; zone++)
+               uds_free_buffered_writer(writers[zone]);
+       if (result != UDS_SUCCESS) {
+               cancel_uds_index_save(isl);
+               return result;
+       }
+
+       result = open_region_writer(layout, &isl->index_page_map, &writers[0]);
+       if (result != UDS_SUCCESS) {
+               cancel_uds_index_save(isl);
+               return result;
+       }
+
+       result = uds_write_index_page_map(index->volume->index_page_map, writers[0]);
+       uds_free_buffered_writer(writers[0]);
+       if (result != UDS_SUCCESS) {
+               cancel_uds_index_save(isl);
+               return result;
+       }
+
+       return write_index_save_layout(layout, isl);
+}
+
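+/* Read a region table from storage and decode its header and layout regions. */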
+static int __must_check load_region_table(struct buffered_reader *reader,
+                                         struct region_table **table_ptr)
+{
+       int result;
+       unsigned int i;
+       struct region_header header;
+       struct region_table *table;
+       u8 buffer[sizeof(struct region_header)];
+       size_t offset = 0;
+
+       result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer));
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "cannot read region table header");
+
+       decode_u64_le(buffer, &offset, &header.magic);
+       decode_u64_le(buffer, &offset, &header.region_blocks);
+       decode_u16_le(buffer, &offset, &header.type);
+       decode_u16_le(buffer, &offset, &header.version);
+       decode_u16_le(buffer, &offset, &header.region_count);
+       decode_u16_le(buffer, &offset, &header.payload);
+
+       if (header.magic != REGION_MAGIC)
+               return UDS_NO_INDEX;
+
+       if (header.version != 1) {
+               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+                                             "unknown region table version %hu",
+                                             header.version);
+       }
+
+       result = uds_allocate_extended(struct region_table, header.region_count,
+                                      struct layout_region,
+                                      "single file layout region table", &table);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       table->header = header;
+       for (i = 0; i < header.region_count; i++) {
+               u8 region_buffer[sizeof(struct layout_region)];
+
+               offset = 0;
+               result = uds_read_from_buffered_reader(reader, region_buffer,
+                                                      sizeof(region_buffer));
+               if (result != UDS_SUCCESS) {
+                       uds_free(table);
+                       return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                                     "cannot read region table layouts");
+               }
+
+               decode_u64_le(region_buffer, &offset, &table->regions[i].start_block);
+               decode_u64_le(region_buffer, &offset, &table->regions[i].block_count);
+               offset += sizeof(u32);
+               decode_u16_le(region_buffer, &offset, &table->regions[i].kind);
+               decode_u16_le(region_buffer, &offset, &table->regions[i].instance);
+       }
+
+       *table_ptr = table;
+       return UDS_SUCCESS;
+}
+
+static int __must_check read_super_block_data(struct buffered_reader *reader,
+                                             struct index_layout *layout,
+                                             size_t saved_size)
+{
+       int result;
+       struct super_block_data *super = &layout->super;
+       u8 *buffer;
+       size_t offset = 0;
+
+       result = uds_allocate(saved_size, u8, "super block data", &buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_read_from_buffered_reader(reader, buffer, saved_size);
+       if (result != UDS_SUCCESS) {
+               uds_free(buffer);
+               return uds_log_error_strerror(result, "cannot read region table header");
+       }
+
+       memcpy(&super->magic_label, buffer, MAGIC_SIZE);
+       offset += MAGIC_SIZE;
+       memcpy(&super->nonce_info, buffer + offset, NONCE_INFO_SIZE);
+       offset += NONCE_INFO_SIZE;
+       decode_u64_le(buffer, &offset, &super->nonce);
+       decode_u32_le(buffer, &offset, &super->version);
+       decode_u32_le(buffer, &offset, &super->block_size);
+       decode_u16_le(buffer, &offset, &super->index_count);
+       decode_u16_le(buffer, &offset, &super->max_saves);
+       offset += sizeof(u32);
+       decode_u64_le(buffer, &offset, &super->open_chapter_blocks);
+       decode_u64_le(buffer, &offset, &super->page_map_blocks);
+
+       if (is_converted_super_block(super)) {
+               decode_u64_le(buffer, &offset, &super->volume_offset);
+               decode_u64_le(buffer, &offset, &super->start_offset);
+       } else {
+               super->volume_offset = 0;
+               super->start_offset = 0;
+       }
+
+       uds_free(buffer);
+
+       if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0)
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "unknown superblock magic label");
+
+       if ((super->version < SUPER_VERSION_MINIMUM) ||
+           (super->version == 4) || (super->version == 5) || (super->version == 6) ||
+           (super->version > SUPER_VERSION_MAXIMUM)) {
+               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+                                             "unknown superblock version number %u",
+                                             super->version);
+       }
+
+       if (super->volume_offset < super->start_offset) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "inconsistent offsets (start %llu, volume %llu)",
+                                             (unsigned long long) super->start_offset,
+                                             (unsigned long long) super->volume_offset);
+       }
+
+       /* Sub-indexes are no longer used but the layout retains this field. */
+       if (super->index_count != 1) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "invalid subindex count %u",
+                                             super->index_count);
+       }
+
+       if (generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info)) != super->nonce) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "inconsistent superblock nonce");
+       }
+
+       return UDS_SUCCESS;
+}
+
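+/* Check that a loaded region has the expected location, kind, and instance. */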
+static int __must_check verify_region(struct layout_region *lr, u64 start_block,
+                                     enum region_kind kind, unsigned int instance)
+{
+       if (lr->start_block != start_block)
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "incorrect layout region offset");
+
+       if (lr->kind != kind)
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "incorrect layout region kind");
+
+       if (lr->instance != instance) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "incorrect layout region instance");
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int __must_check verify_sub_index(struct index_layout *layout, u64 start_block,
+                                        struct region_table *table)
+{
+       int result;
+       unsigned int i;
+       struct sub_index_layout *sil = &layout->index;
+       u64 next_block = start_block;
+
+       sil->sub_index = table->regions[2];
+       result = verify_region(&sil->sub_index, next_block, RL_KIND_INDEX, 0);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       define_sub_index_nonce(layout);
+
+       sil->volume = table->regions[3];
+       result = verify_region(&sil->volume, next_block, RL_KIND_VOLUME,
+                              RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       next_block += sil->volume.block_count + layout->super.volume_offset;
+
+       for (i = 0; i < layout->super.max_saves; i++) {
+               sil->saves[i].index_save = table->regions[i + 4];
+               result = verify_region(&sil->saves[i].index_save, next_block,
+                                      RL_KIND_SAVE, i);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               next_block += sil->saves[i].index_save.block_count;
+       }
+
+       next_block -= layout->super.volume_offset;
+       if (next_block != start_block + sil->sub_index.block_count) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "sub index region does not span all saves");
+       }
+
+       return UDS_SUCCESS;
+}
+
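+/* Rebuild the in-memory layout from a loaded region table, verifying each region. */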
+static int __must_check reconstitute_layout(struct index_layout *layout,
+                                           struct region_table *table, u64 first_block)
+{
+       int result;
+       u64 next_block = first_block;
+
+       result = uds_allocate(layout->super.max_saves, struct index_save_layout,
+                             __func__, &layout->index.saves);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       layout->total_blocks = table->header.region_blocks;
+
+       layout->header = table->regions[0];
+       result = verify_region(&layout->header, next_block++, RL_KIND_HEADER,
+                              RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       layout->config = table->regions[1];
+       result = verify_region(&layout->config, next_block++, RL_KIND_CONFIG,
+                              RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = verify_sub_index(layout, next_block, table);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       next_block += layout->index.sub_index.block_count;
+
+       layout->seal = table->regions[table->header.region_count - 1];
+       result = verify_region(&layout->seal, next_block + layout->super.volume_offset,
+                              RL_KIND_SEAL, RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (++next_block != (first_block + layout->total_blocks)) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "layout table does not span total blocks");
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int __must_check load_super_block(struct index_layout *layout, size_t block_size,
+                                        u64 first_block, struct buffered_reader *reader)
+{
+       int result;
+       struct region_table *table = NULL;
+       struct super_block_data *super = &layout->super;
+
+       result = load_region_table(reader, &table);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (table->header.type != RH_TYPE_SUPER) {
+               uds_free(table);
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "not a superblock region table");
+       }
+
+       result = read_super_block_data(reader, layout, table->header.payload);
+       if (result != UDS_SUCCESS) {
+               uds_free(table);
+               return uds_log_error_strerror(result, "unknown superblock format");
+       }
+
+       if (super->block_size != block_size) {
+               uds_free(table);
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "superblock saved block_size %u differs from supplied block_size %zu",
+                                             super->block_size, block_size);
+       }
+
+       first_block -= (super->volume_offset - super->start_offset);
+       result = reconstitute_layout(layout, table, first_block);
+       uds_free(table);
+       return result;
+}
+
+static int __must_check read_index_save_data(struct buffered_reader *reader,
+                                            struct index_save_layout *isl,
+                                            size_t saved_size)
+{
+       int result;
+       struct index_state_version file_version;
+       u8 buffer[sizeof(struct index_save_data) + sizeof(struct index_state_data301)];
+       size_t offset = 0;
+
+       if (saved_size != sizeof(buffer)) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "unexpected index save data size %zu",
+                                             saved_size);
+       }
+
+       result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer));
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "cannot read index save data");
+
+       decode_u64_le(buffer, &offset, &isl->save_data.timestamp);
+       decode_u64_le(buffer, &offset, &isl->save_data.nonce);
+       decode_u32_le(buffer, &offset, &isl->save_data.version);
+       offset += sizeof(u32);
+
+       if (isl->save_data.version > 1) {
+               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+                                             "unknown index save version number %u",
+                                             isl->save_data.version);
+       }
+
+       decode_s32_le(buffer, &offset, &file_version.signature);
+       decode_s32_le(buffer, &offset, &file_version.version_id);
+
+       if ((file_version.signature != INDEX_STATE_VERSION_301.signature) ||
+           (file_version.version_id != INDEX_STATE_VERSION_301.version_id)) {
+               return uds_log_error_strerror(UDS_UNSUPPORTED_VERSION,
+                                             "index state version %d,%d is unsupported",
+                                             file_version.signature,
+                                             file_version.version_id);
+       }
+
+       decode_u64_le(buffer, &offset, &isl->state_data.newest_chapter);
+       decode_u64_le(buffer, &offset, &isl->state_data.oldest_chapter);
+       decode_u64_le(buffer, &offset, &isl->state_data.last_save);
+       /* Skip past some historical fields that are now unused */
+       offset += sizeof(u32) + sizeof(u32);
+       return UDS_SUCCESS;
+}
+
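+/*
+ * Rebuild an index_save_layout from a loaded region table, checking that the regions are
+ * contiguous, of the expected kinds, and exactly fill the save slot.
+ */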
+static int __must_check reconstruct_index_save(struct index_save_layout *isl,
+                                              struct region_table *table)
+{
+       int result;
+       unsigned int z;
+       struct layout_region *last_region;
+       u64 next_block = isl->index_save.start_block;
+       u64 last_block = next_block + isl->index_save.block_count;
+
+       isl->zone_count = table->header.region_count - 3;
+
+       last_region = &table->regions[table->header.region_count - 1];
+       if (last_region->kind == RL_KIND_EMPTY) {
+               isl->free_space = *last_region;
+               isl->zone_count--;
+       } else {
+               isl->free_space = (struct layout_region) {
+                       .start_block = last_block,
+                       .block_count = 0,
+                       .kind = RL_KIND_EMPTY,
+                       .instance = RL_SOLE_INSTANCE,
+               };
+       }
+
+       isl->header = table->regions[0];
+       result = verify_region(&isl->header, next_block++, RL_KIND_HEADER,
+                              RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       isl->index_page_map = table->regions[1];
+       result = verify_region(&isl->index_page_map, next_block, RL_KIND_INDEX_PAGE_MAP,
+                              RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       next_block += isl->index_page_map.block_count;
+
+       for (z = 0; z < isl->zone_count; z++) {
+               isl->volume_index_zones[z] = table->regions[z + 2];
+               result = verify_region(&isl->volume_index_zones[z], next_block,
+                                      RL_KIND_VOLUME_INDEX, z);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               next_block += isl->volume_index_zones[z].block_count;
+       }
+
+       isl->open_chapter = table->regions[isl->zone_count + 2];
+       result = verify_region(&isl->open_chapter, next_block, RL_KIND_OPEN_CHAPTER,
+                              RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       next_block += isl->open_chapter.block_count;
+
+       result = verify_region(&isl->free_space, next_block, RL_KIND_EMPTY,
+                              RL_SOLE_INSTANCE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       next_block += isl->free_space.block_count;
+       if (next_block != last_block) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "index save layout table incomplete");
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int __must_check load_index_save(struct index_save_layout *isl,
+                                       struct buffered_reader *reader,
+                                       unsigned int instance)
+{
+       int result;
+       struct region_table *table = NULL;
+
+       result = load_region_table(reader, &table);
+       if (result != UDS_SUCCESS) {
+               return uds_log_error_strerror(result, "cannot read index save %u header",
+                                             instance);
+       }
+
+       if (table->header.region_blocks != isl->index_save.block_count) {
+               u64 region_blocks = table->header.region_blocks;
+
+               uds_free(table);
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "unexpected index save %u region block count %llu",
+                                             instance,
+                                             (unsigned long long) region_blocks);
+       }
+
+       if (table->header.type == RH_TYPE_UNSAVED) {
+               uds_free(table);
+               reset_index_save_layout(isl, 0);
+               return UDS_SUCCESS;
+       }
+
+       if (table->header.type != RH_TYPE_SAVE) {
+               uds_free(table);
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "unexpected index save %u header type %u",
+                                             instance, table->header.type);
+       }
+
+       result = read_index_save_data(reader, isl, table->header.payload);
+       if (result != UDS_SUCCESS) {
+               uds_free(table);
+               return uds_log_error_strerror(result,
+                                             "unknown index save %u data format",
+                                             instance);
+       }
+
+       result = reconstruct_index_save(isl, table);
+       uds_free(table);
+       if (result != UDS_SUCCESS) {
+               return uds_log_error_strerror(result, "cannot reconstruct index save %u",
+                                             instance);
+       }
+
+       return UDS_SUCCESS;
+}
+
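+/* Load every save slot; any slot that cannot be loaded is reset so other slots can be used. */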
+static int __must_check load_sub_index_regions(struct index_layout *layout)
+{
+       int result;
+       unsigned int j;
+       struct index_save_layout *isl;
+       struct buffered_reader *reader;
+
+       for (j = 0; j < layout->super.max_saves; j++) {
+               isl = &layout->index.saves[j];
+               result = open_region_reader(layout, &isl->index_save, &reader);
+
+               if (result != UDS_SUCCESS) {
+                       uds_log_error_strerror(result,
+                                              "cannot get reader for index 0 save %u",
+                                              j);
+                       return result;
+               }
+
+               result = load_index_save(isl, reader, j);
+               uds_free_buffered_reader(reader);
+               if (result != UDS_SUCCESS) {
+                       /* Another save slot might be valid. */
+                       reset_index_save_layout(isl, 0);
+                       continue;
+               }
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int __must_check verify_uds_index_config(struct index_layout *layout,
+                                               struct uds_configuration *config)
+{
+       int result;
+       struct buffered_reader *reader = NULL;
+       u64 offset;
+
+       offset = layout->super.volume_offset - layout->super.start_offset;
+       result = open_layout_reader(layout, &layout->config, offset, &reader);
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "failed to open config reader");
+
+       result = uds_validate_config_contents(reader, config);
+       if (result != UDS_SUCCESS) {
+               uds_free_buffered_reader(reader);
+               return uds_log_error_strerror(result, "failed to read config region");
+       }
+
+       uds_free_buffered_reader(reader);
+       return UDS_SUCCESS;
+}
+
+static int load_index_layout(struct index_layout *layout, struct uds_configuration *config)
+{
+       int result;
+       struct buffered_reader *reader;
+
+       result = uds_make_buffered_reader(layout->factory,
+                                         layout->offset / UDS_BLOCK_SIZE, 1, &reader);
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result, "unable to read superblock");
+
+       result = load_super_block(layout, UDS_BLOCK_SIZE,
+                                 layout->offset / UDS_BLOCK_SIZE, reader);
+       uds_free_buffered_reader(reader);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = verify_uds_index_config(layout, config);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return load_sub_index_regions(layout);
+}
+
+static int create_layout_factory(struct index_layout *layout,
+                                const struct uds_configuration *config)
+{
+       int result;
+       size_t writable_size;
+       struct io_factory *factory = NULL;
+
+       result = uds_make_io_factory(config->bdev, &factory);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       writable_size = uds_get_writable_size(factory) & -UDS_BLOCK_SIZE;
+       if (writable_size < config->size + config->offset) {
+               uds_put_io_factory(factory);
+               uds_log_error("index storage (%zu) is smaller than the requested size %zu",
+                             writable_size, config->size + config->offset);
+               return -ENOSPC;
+       }
+
+       layout->factory = factory;
+       layout->factory_size = (config->size > 0) ? config->size : writable_size;
+       layout->offset = config->offset;
+       return UDS_SUCCESS;
+}
+
+int uds_make_index_layout(struct uds_configuration *config, bool new_layout,
+                         struct index_layout **layout_ptr)
+{
+       int result;
+       struct index_layout *layout = NULL;
+       struct save_layout_sizes sizes;
+
+       result = compute_sizes(config, &sizes);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(1, struct index_layout, __func__, &layout);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = create_layout_factory(layout, config);
+       if (result != UDS_SUCCESS) {
+               uds_free_index_layout(layout);
+               return result;
+       }
+
+       if (layout->factory_size < sizes.total_size) {
+               uds_log_error("index storage (%zu) is smaller than the required size %llu",
+                             layout->factory_size,
+                             (unsigned long long) sizes.total_size);
+               uds_free_index_layout(layout);
+               return -ENOSPC;
+       }
+
+       if (new_layout)
+               result = create_index_layout(layout, config);
+       else
+               result = load_index_layout(layout, config);
+       if (result != UDS_SUCCESS) {
+               uds_free_index_layout(layout);
+               return result;
+       }
+
+       *layout_ptr = layout;
+       return UDS_SUCCESS;
+}
+
+void uds_free_index_layout(struct index_layout *layout)
+{
+       if (layout == NULL)
+               return;
+
+       uds_free(layout->index.saves);
+       if (layout->factory != NULL)
+               uds_put_io_factory(layout->factory);
+
+       uds_free(layout);
+}
+
+int uds_replace_index_layout_storage(struct index_layout *layout,
+                                    struct block_device *bdev)
+{
+       return uds_replace_storage(layout->factory, bdev);
+}
+
+/* Obtain a dm_bufio_client for the volume region. */
+int uds_open_volume_bufio(struct index_layout *layout, size_t block_size,
+                         unsigned int reserved_buffers,
+                         struct dm_bufio_client **client_ptr)
+{
+       off_t offset = (layout->index.volume.start_block +
+                       layout->super.volume_offset -
+                       layout->super.start_offset);
+
+       return uds_make_bufio(layout->factory, offset, block_size, reserved_buffers,
+                             client_ptr);
+}
+
+u64 uds_get_volume_nonce(struct index_layout *layout)
+{
+       return layout->index.nonce;
+}
diff --git a/drivers/md/dm-vdo/indexer/index-layout.h b/drivers/md/dm-vdo/indexer/index-layout.h
new file mode 100644 (file)
index 0000000..e9ac6f4
--- /dev/null
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_LAYOUT_H
+#define UDS_INDEX_LAYOUT_H
+
+#include "config.h"
+#include "indexer.h"
+#include "io-factory.h"
+
+/*
+ * The index layout describes the format of the index on the underlying storage, and is responsible
+ * for creating those structures when the index is first created. It also validates the index data
+ * when loading a saved index, and updates it when saving the index.
+ */
+
+struct index_layout;
+
+int __must_check uds_make_index_layout(struct uds_configuration *config, bool new_layout,
+                                      struct index_layout **layout_ptr);
+
+void uds_free_index_layout(struct index_layout *layout);
+
+int __must_check uds_replace_index_layout_storage(struct index_layout *layout,
+                                                 struct block_device *bdev);
+
+int __must_check uds_load_index_state(struct index_layout *layout,
+                                     struct uds_index *index);
+
+int __must_check uds_save_index_state(struct index_layout *layout,
+                                     struct uds_index *index);
+
+int __must_check uds_discard_open_chapter(struct index_layout *layout);
+
+u64 __must_check uds_get_volume_nonce(struct index_layout *layout);
+
+int __must_check uds_open_volume_bufio(struct index_layout *layout, size_t block_size,
+                                      unsigned int reserved_buffers,
+                                      struct dm_bufio_client **client_ptr);
+
+#endif /* UDS_INDEX_LAYOUT_H */
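
As a usage note (illustrative only, not part of this patch): a caller drives this API by building a layout against a uds_configuration and, if needed, handing the volume region to dm-bufio via uds_open_volume_bufio(). The sketch below is hypothetical; example_check_layout is an invented name and error handling follows the UDS_SUCCESS convention used throughout the indexer.

static int example_check_layout(struct uds_configuration *config)
{
        int result;
        struct index_layout *layout;

        /* Illustrative only: create a brand-new layout on the configured storage. */
        result = uds_make_index_layout(config, true, &layout);
        if (result != UDS_SUCCESS)
                return result;

        uds_log_info("new layout created, volume nonce %llu",
                     (unsigned long long) uds_get_volume_nonce(layout));
        uds_free_index_layout(layout);
        return UDS_SUCCESS;
}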
diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c
new file mode 100644 (file)
index 0000000..90d97c3
--- /dev/null
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "index-page-map.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "thread-utils.h"
+
+#include "hash-utils.h"
+#include "indexer.h"
+
+/*
+ * The index page map is conceptually a two-dimensional array indexed by chapter number and index
+ * page number within the chapter. Each entry contains the number of the last delta list on that
+ * index page. In order to save memory, the information for the last page in each chapter is not
+ * recorded, as it is known from the geometry.
+ */
+
+static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02";
+
+enum {
+       PAGE_MAP_MAGIC_LENGTH = sizeof(PAGE_MAP_MAGIC) - 1,
+};
+
+static inline u32 get_entry_count(const struct index_geometry *geometry)
+{
+       return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1);
+}
+
+int uds_make_index_page_map(const struct index_geometry *geometry,
+                           struct index_page_map **map_ptr)
+{
+       int result;
+       struct index_page_map *map;
+
+       result = uds_allocate(1, struct index_page_map, "page map", &map);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       map->geometry = geometry;
+       map->entries_per_chapter = geometry->index_pages_per_chapter - 1;
+       result = uds_allocate(get_entry_count(geometry), u16, "Index Page Map Entries",
+                             &map->entries);
+       if (result != UDS_SUCCESS) {
+               uds_free_index_page_map(map);
+               return result;
+       }
+
+       *map_ptr = map;
+       return UDS_SUCCESS;
+}
+
+void uds_free_index_page_map(struct index_page_map *map)
+{
+       if (map != NULL) {
+               uds_free(map->entries);
+               uds_free(map);
+       }
+}
+
+void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
+                              u32 chapter_number, u32 index_page_number,
+                              u32 delta_list_number)
+{
+       size_t slot;
+
+       map->last_update = virtual_chapter_number;
+       if (index_page_number == map->entries_per_chapter)
+               return;
+
+       slot = (chapter_number * map->entries_per_chapter) + index_page_number;
+       map->entries[slot] = delta_list_number;
+}
+
+u32 uds_find_index_page_number(const struct index_page_map *map,
+                              const struct uds_record_name *name, u32 chapter_number)
+{
+       u32 delta_list_number = uds_hash_to_chapter_delta_list(name, map->geometry);
+       u32 slot = chapter_number * map->entries_per_chapter;
+       u32 page;
+
+       for (page = 0; page < map->entries_per_chapter; page++) {
+               if (delta_list_number <= map->entries[slot + page])
+                       break;
+       }
+
+       return page;
+}
+
+void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
+                               u32 index_page_number, u32 *lowest_list,
+                               u32 *highest_list)
+{
+       u32 slot = chapter_number * map->entries_per_chapter;
+
+       *lowest_list = ((index_page_number == 0) ?
+                       0 : map->entries[slot + index_page_number - 1] + 1);
+       *highest_list = ((index_page_number < map->entries_per_chapter) ?
+                        map->entries[slot + index_page_number] :
+                        map->geometry->delta_lists_per_chapter - 1);
+}
+
+u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry)
+{
+       return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry);
+}
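
For a sense of scale (an illustration with assumed numbers, not values taken from this patch): a hypothetical geometry with 1024 chapters per volume and 6 index pages per chapter gives get_entry_count() = 1024 * (6 - 1) = 5120 entries, so the saved region is 8 bytes of magic + 8 bytes of last_update + 2 * 5120 bytes of entries = 10256 bytes.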
+
+int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer)
+{
+       int result;
+       u8 *buffer;
+       size_t offset = 0;
+       u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
+       u32 i;
+
+       result = uds_allocate(saved_size, u8, "page map data", &buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH);
+       offset += PAGE_MAP_MAGIC_LENGTH;
+       encode_u64_le(buffer, &offset, map->last_update);
+       for (i = 0; i < get_entry_count(map->geometry); i++)
+               encode_u16_le(buffer, &offset, map->entries[i]);
+
+       result = uds_write_to_buffered_writer(writer, buffer, offset);
+       uds_free(buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return uds_flush_buffered_writer(writer);
+}
+
+int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader)
+{
+       int result;
+       u8 magic[PAGE_MAP_MAGIC_LENGTH];
+       u8 *buffer;
+       size_t offset = 0;
+       u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
+       u32 i;
+
+       result = uds_allocate(saved_size, u8, "page map data", &buffer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_read_from_buffered_reader(reader, buffer, saved_size);
+       if (result != UDS_SUCCESS) {
+               uds_free(buffer);
+               return result;
+       }
+
+       memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH);
+       offset += PAGE_MAP_MAGIC_LENGTH;
+       if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) {
+               uds_free(buffer);
+               return UDS_CORRUPT_DATA;
+       }
+
+       decode_u64_le(buffer, &offset, &map->last_update);
+       for (i = 0; i < get_entry_count(map->geometry); i++)
+               decode_u16_le(buffer, &offset, &map->entries[i]);
+
+       uds_free(buffer);
+       uds_log_debug("read index page map, last update %llu",
+                     (unsigned long long) map->last_update);
+       return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/index-page-map.h b/drivers/md/dm-vdo/indexer/index-page-map.h
new file mode 100644 (file)
index 0000000..b327c0b
--- /dev/null
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_PAGE_MAP_H
+#define UDS_INDEX_PAGE_MAP_H
+
+#include "geometry.h"
+#include "io-factory.h"
+
+/*
+ * The index maintains a page map which records how the chapter delta lists are distributed among
+ * the index pages for each chapter, allowing the volume to be efficient about reading only pages
+ * that it knows it will need.
+ */
+
+struct index_page_map {
+       const struct index_geometry *geometry;
+       u64 last_update;
+       u32 entries_per_chapter;
+       u16 *entries;
+};
+
+int __must_check uds_make_index_page_map(const struct index_geometry *geometry,
+                                        struct index_page_map **map_ptr);
+
+void uds_free_index_page_map(struct index_page_map *map);
+
+int __must_check uds_read_index_page_map(struct index_page_map *map,
+                                        struct buffered_reader *reader);
+
+int __must_check uds_write_index_page_map(struct index_page_map *map,
+                                         struct buffered_writer *writer);
+
+void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
+                              u32 chapter_number, u32 index_page_number,
+                              u32 delta_list_number);
+
+u32 __must_check uds_find_index_page_number(const struct index_page_map *map,
+                                           const struct uds_record_name *name,
+                                           u32 chapter_number);
+
+void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
+                               u32 index_page_number, u32 *lowest_list,
+                               u32 *highest_list);
+
+u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry);
+
+#endif /* UDS_INDEX_PAGE_MAP_H */
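
A hypothetical reader-side sketch (not part of this patch) showing how the two lookup helpers above combine: the record name selects a delta list, the page map turns that into the single index page worth reading, and the bounds identify which delta lists that page covers. example_locate_index_page is an invented name.

static void example_locate_index_page(const struct index_page_map *map,
                                      const struct uds_record_name *name,
                                      u32 physical_chapter)
{
        u32 lowest, highest;
        u32 page = uds_find_index_page_number(map, name, physical_chapter);

        uds_get_list_number_bounds(map, physical_chapter, page, &lowest, &highest);
        /* Only this one index page needs to be read to search for the name. */
        uds_log_debug("record maps to index page %u (delta lists %u-%u)",
                      page, lowest, highest);
}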
diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c
new file mode 100644 (file)
index 0000000..07b478f
--- /dev/null
@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "index-session.h"
+
+#include <linux/atomic.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "time-utils.h"
+
+#include "funnel-requestqueue.h"
+#include "index.h"
+#include "index-layout.h"
+
+/*
+ * The index session contains a lock (the request_mutex) which ensures that only one thread can
+ * change the state of its index at a time. The state field indicates the current state of the
+ * index through a set of descriptive flags. The request_cond must be broadcast (with the
+ * request_mutex held) whenever a non-transient state flag is cleared. The request_mutex also
+ * guards the count of requests currently in progress so that they can be drained when suspending
+ * or closing the index.
+ *
+ * If the index session is suspended shortly after opening an index, it may have to suspend during
+ * a rebuild. Depending on the size of the index, a rebuild may take a significant amount of time,
+ * so UDS allows the rebuild to be paused in order to suspend the session in a timely manner. When
+ * the index session is resumed, the rebuild can continue from where it left off. If the index
+ * session is shut down with a suspended rebuild, the rebuild progress is abandoned and the rebuild
+ * will start from the beginning the next time the index is loaded. The mutex and status fields in
+ * the index_load_context are used to record the state of any interrupted rebuild.
+ */
+
+enum index_session_flag_bit {
+       IS_FLAG_BIT_START = 8,
+       /* The session has started loading an index but not completed it. */
+       IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START,
+       /* The session has loaded an index, which can handle requests. */
+       IS_FLAG_BIT_LOADED,
+       /* The session's index has been permanently disabled. */
+       IS_FLAG_BIT_DISABLED,
+       /* The session's index is suspended. */
+       IS_FLAG_BIT_SUSPENDED,
+       /* The session is handling some index state change. */
+       IS_FLAG_BIT_WAITING,
+       /* The session's index is closing and draining requests. */
+       IS_FLAG_BIT_CLOSING,
+       /* The session is being destroyed and is draining requests. */
+       IS_FLAG_BIT_DESTROYING,
+};
+
+enum index_session_flag {
+       IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED),
+       IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING),
+       IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED),
+       IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED),
+       IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING),
+       IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING),
+       IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING),
+};
+
+/* Release a reference to an index session. */
+static void release_index_session(struct uds_index_session *index_session)
+{
+       mutex_lock(&index_session->request_mutex);
+       if (--index_session->request_count == 0)
+               uds_broadcast_cond(&index_session->request_cond);
+       mutex_unlock(&index_session->request_mutex);
+}
+
+/*
+ * Acquire a reference to the index session for an asynchronous index request. The reference must
+ * eventually be released with a corresponding call to release_index_session().
+ */
+static int get_index_session(struct uds_index_session *index_session)
+{
+       unsigned int state;
+       int result = UDS_SUCCESS;
+
+       mutex_lock(&index_session->request_mutex);
+       index_session->request_count++;
+       state = index_session->state;
+       mutex_unlock(&index_session->request_mutex);
+
+       if (state == IS_FLAG_LOADED) {
+               return UDS_SUCCESS;
+       } else if (state & IS_FLAG_DISABLED) {
+               result = UDS_DISABLED;
+       } else if ((state & IS_FLAG_LOADING) ||
+                  (state & IS_FLAG_SUSPENDED) ||
+                  (state & IS_FLAG_WAITING)) {
+               result = -EBUSY;
+       } else {
+               result = UDS_NO_INDEX;
+       }
+
+       release_index_session(index_session);
+       return result;
+}
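
The reference counting above (together with wait_for_no_requests_in_progress() later in this file) is the standard count-and-broadcast drain pattern. A minimal user-space model of that pattern, with POSIX threads standing in for the kernel mutex and cond_var, might look like the following; all names here are hypothetical.

#include <pthread.h>

struct session_model {
        /* Initialize with PTHREAD_MUTEX_INITIALIZER / PTHREAD_COND_INITIALIZER. */
        pthread_mutex_t mutex;
        pthread_cond_t cond;
        int request_count;
};

static void model_get(struct session_model *s)
{
        /* Take a reference before starting a request. */
        pthread_mutex_lock(&s->mutex);
        s->request_count++;
        pthread_mutex_unlock(&s->mutex);
}

static void model_release(struct session_model *s)
{
        /* Drop the reference; wake any drainer when the count hits zero. */
        pthread_mutex_lock(&s->mutex);
        if (--s->request_count == 0)
                pthread_cond_broadcast(&s->cond);
        pthread_mutex_unlock(&s->mutex);
}

static void model_drain(struct session_model *s)
{
        /* Block until every outstanding request has been released. */
        pthread_mutex_lock(&s->mutex);
        while (s->request_count > 0)
                pthread_cond_wait(&s->cond, &s->mutex);
        pthread_mutex_unlock(&s->mutex);
}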
+
+int uds_launch_request(struct uds_request *request)
+{
+       size_t internal_size;
+       int result;
+
+       if (request->callback == NULL) {
+               uds_log_error("missing required callback");
+               return -EINVAL;
+       }
+
+       switch (request->type) {
+       case UDS_DELETE:
+       case UDS_POST:
+       case UDS_QUERY:
+       case UDS_QUERY_NO_UPDATE:
+       case UDS_UPDATE:
+               break;
+       default:
+               uds_log_error("received invalid callback type");
+               return -EINVAL;
+       }
+
+       /* Reset all internal fields before processing. */
+       internal_size =
+               sizeof(struct uds_request) - offsetof(struct uds_request, zone_number);
+       // FIXME should be using struct_group for this instead
+       memset((char *) request + sizeof(*request) - internal_size, 0, internal_size);
+
+       result = get_index_session(request->session);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       request->found = false;
+       request->unbatched = false;
+       request->index = request->session->index;
+
+       uds_enqueue_request(request, STAGE_TRIAGE);
+       return UDS_SUCCESS;
+}
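
For context (a hypothetical caller, not part of this patch): a deduplication client fills in the public fields of a uds_request and hands it to uds_launch_request(); the callback fires later on the session's callback queue. Field names follow the usage visible in this file; example_callback and example_post are invented for illustration.

static void example_callback(struct uds_request *request)
{
        if ((request->status == UDS_SUCCESS) && request->found)
                uds_log_info("record already indexed");
}

static int example_post(struct uds_index_session *session,
                        const struct uds_record_name *name)
{
        /* Requests are asynchronous, so they must outlive this function; real
         * callers embed them in longer-lived structures. A static is used
         * only to keep this sketch short. */
        static struct uds_request request;

        request.record_name = *name;
        request.type = UDS_POST;
        request.session = session;
        request.callback = example_callback;
        return uds_launch_request(&request);
}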
+
+static void enter_callback_stage(struct uds_request *request)
+{
+       if (request->status != UDS_SUCCESS) {
+               /* All request errors are considered unrecoverable */
+               mutex_lock(&request->session->request_mutex);
+               request->session->state |= IS_FLAG_DISABLED;
+               mutex_unlock(&request->session->request_mutex);
+       }
+
+       uds_request_queue_enqueue(request->session->callback_queue, request);
+}
+
+static inline void count_once(u64 *count_ptr)
+{
+       WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1);
+}
+
+static void update_session_stats(struct uds_request *request)
+{
+       struct session_stats *session_stats = &request->session->stats;
+
+       count_once(&session_stats->requests);
+
+       switch (request->type) {
+       case UDS_POST:
+               if (request->found)
+                       count_once(&session_stats->posts_found);
+               else
+                       count_once(&session_stats->posts_not_found);
+
+               if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
+                       count_once(&session_stats->posts_found_open_chapter);
+               else if (request->location == UDS_LOCATION_IN_DENSE)
+                       count_once(&session_stats->posts_found_dense);
+               else if (request->location == UDS_LOCATION_IN_SPARSE)
+                       count_once(&session_stats->posts_found_sparse);
+               break;
+
+       case UDS_UPDATE:
+               if (request->found)
+                       count_once(&session_stats->updates_found);
+               else
+                       count_once(&session_stats->updates_not_found);
+               break;
+
+       case UDS_DELETE:
+               if (request->found)
+                       count_once(&session_stats->deletions_found);
+               else
+                       count_once(&session_stats->deletions_not_found);
+               break;
+
+       case UDS_QUERY:
+       case UDS_QUERY_NO_UPDATE:
+               if (request->found)
+                       count_once(&session_stats->queries_found);
+               else
+                       count_once(&session_stats->queries_not_found);
+               break;
+
+       default:
+               request->status = ASSERT(false, "unknown request type: %d",
+                                        request->type);
+       }
+}
+
+static void handle_callbacks(struct uds_request *request)
+{
+       struct uds_index_session *index_session = request->session;
+
+       if (request->status == UDS_SUCCESS)
+               update_session_stats(request);
+
+       request->status = uds_status_to_errno(request->status);
+       request->callback(request);
+       release_index_session(index_session);
+}
+
+static int __must_check make_empty_index_session(struct uds_index_session **index_session_ptr)
+{
+       int result;
+       struct uds_index_session *session;
+
+       result = uds_allocate(1, struct uds_index_session, __func__, &session);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       mutex_init(&session->request_mutex);
+       uds_init_cond(&session->request_cond);
+       mutex_init(&session->load_context.mutex);
+       uds_init_cond(&session->load_context.cond);
+
+       result = uds_make_request_queue("callbackW", &handle_callbacks,
+                                       &session->callback_queue);
+       if (result != UDS_SUCCESS) {
+               uds_free(session);
+               return result;
+       }
+
+       *index_session_ptr = session;
+       return UDS_SUCCESS;
+}
+
+int uds_create_index_session(struct uds_index_session **session)
+{
+       if (session == NULL) {
+               uds_log_error("missing session pointer");
+               return -EINVAL;
+       }
+
+       return uds_status_to_errno(make_empty_index_session(session));
+}
+
+static int __must_check start_loading_index_session(struct uds_index_session *index_session)
+{
+       int result;
+
+       mutex_lock(&index_session->request_mutex);
+       if (index_session->state & IS_FLAG_SUSPENDED) {
+               uds_log_info("Index session is suspended");
+               result = -EBUSY;
+       } else if (index_session->state != 0) {
+               uds_log_info("Index is already loaded");
+               result = -EBUSY;
+       } else {
+               index_session->state |= IS_FLAG_LOADING;
+               result = UDS_SUCCESS;
+       }
+       mutex_unlock(&index_session->request_mutex);
+       return result;
+}
+
+static void finish_loading_index_session(struct uds_index_session *index_session,
+                                        int result)
+{
+       mutex_lock(&index_session->request_mutex);
+       index_session->state &= ~IS_FLAG_LOADING;
+       if (result == UDS_SUCCESS)
+               index_session->state |= IS_FLAG_LOADED;
+
+       uds_broadcast_cond(&index_session->request_cond);
+       mutex_unlock(&index_session->request_mutex);
+}
+
+static int initialize_index_session(struct uds_index_session *index_session,
+                                   enum uds_open_index_type open_type)
+{
+       int result;
+       struct uds_configuration *config;
+
+       result = uds_make_configuration(&index_session->parameters, &config);
+       if (result != UDS_SUCCESS) {
+               uds_log_error_strerror(result, "Failed to allocate config");
+               return result;
+       }
+
+       memset(&index_session->stats, 0, sizeof(index_session->stats));
+       result = uds_make_index(config, open_type, &index_session->load_context,
+                               enter_callback_stage, &index_session->index);
+       if (result != UDS_SUCCESS)
+               uds_log_error_strerror(result, "Failed to make index");
+       else
+               uds_log_configuration(config);
+
+       uds_free_configuration(config);
+       return result;
+}
+
+static const char *get_open_type_string(enum uds_open_index_type open_type)
+{
+       switch (open_type) {
+       case UDS_CREATE:
+               return "creating index";
+       case UDS_LOAD:
+               return "loading or rebuilding index";
+       case UDS_NO_REBUILD:
+               return "loading index";
+       default:
+               return "unknown open method";
+       }
+}
+
+/*
+ * Open an index under the given session. This operation will fail if the
+ * index session is suspended, or if there is already an open index.
+ */
+int uds_open_index(enum uds_open_index_type open_type,
+                  const struct uds_parameters *parameters,
+                  struct uds_index_session *session)
+{
+       int result;
+       char name[BDEVNAME_SIZE];
+
+       if (parameters == NULL) {
+               uds_log_error("missing required parameters");
+               return -EINVAL;
+       }
+       if (parameters->bdev == NULL) {
+               uds_log_error("missing required block device");
+               return -EINVAL;
+       }
+       if (session == NULL) {
+               uds_log_error("missing required session pointer");
+               return -EINVAL;
+       }
+
+       result = start_loading_index_session(session);
+       if (result != UDS_SUCCESS)
+               return uds_status_to_errno(result);
+
+       session->parameters = *parameters;
+       format_dev_t(name, parameters->bdev->bd_dev);
+       uds_log_info("%s: %s", get_open_type_string(open_type), name);
+
+       result = initialize_index_session(session, open_type);
+       if (result != UDS_SUCCESS)
+               uds_log_error_strerror(result, "Failed %s",
+                                      get_open_type_string(open_type));
+
+       finish_loading_index_session(session, result);
+       return uds_status_to_errno(result);
+}
+
+static void wait_for_no_requests_in_progress(struct uds_index_session *index_session)
+{
+       mutex_lock(&index_session->request_mutex);
+       while (index_session->request_count > 0) {
+               uds_wait_cond(&index_session->request_cond,
+                             &index_session->request_mutex);
+       }
+       mutex_unlock(&index_session->request_mutex);
+}
+
+static int __must_check save_index(struct uds_index_session *index_session)
+{
+       wait_for_no_requests_in_progress(index_session);
+       return uds_save_index(index_session->index);
+}
+
+static void suspend_rebuild(struct uds_index_session *session)
+{
+       mutex_lock(&session->load_context.mutex);
+       switch (session->load_context.status) {
+       case INDEX_OPENING:
+               session->load_context.status = INDEX_SUSPENDING;
+
+               /* Wait until the index indicates that it is not replaying. */
+               while ((session->load_context.status != INDEX_SUSPENDED) &&
+                      (session->load_context.status != INDEX_READY)) {
+                       uds_wait_cond(&session->load_context.cond,
+                                     &session->load_context.mutex);
+               }
+
+               break;
+
+       case INDEX_READY:
+               /* Index load does not need to be suspended. */
+               break;
+
+       case INDEX_SUSPENDED:
+       case INDEX_SUSPENDING:
+       case INDEX_FREEING:
+       default:
+               /* These cases should not happen. */
+               ASSERT_LOG_ONLY(false, "Bad load context state %u",
+                               session->load_context.status);
+               break;
+       }
+       mutex_unlock(&session->load_context.mutex);
+}
+
+/*
+ * Suspend index operation, draining all current index requests and preventing new index requests
+ * from starting. Optionally saves all index data before returning.
+ */
+int uds_suspend_index_session(struct uds_index_session *session, bool save)
+{
+       int result = UDS_SUCCESS;
+       bool no_work = false;
+       bool rebuilding = false;
+
+       /* Wait for any current index state change to complete. */
+       mutex_lock(&session->request_mutex);
+       while (session->state & IS_FLAG_CLOSING)
+               uds_wait_cond(&session->request_cond, &session->request_mutex);
+
+       if ((session->state & IS_FLAG_WAITING) || (session->state & IS_FLAG_DESTROYING)) {
+               no_work = true;
+               uds_log_info("Index session is already changing state");
+               result = -EBUSY;
+       } else if (session->state & IS_FLAG_SUSPENDED) {
+               no_work = true;
+       } else if (session->state & IS_FLAG_LOADING) {
+               session->state |= IS_FLAG_WAITING;
+               rebuilding = true;
+       } else if (session->state & IS_FLAG_LOADED) {
+               session->state |= IS_FLAG_WAITING;
+       } else {
+               no_work = true;
+               session->state |= IS_FLAG_SUSPENDED;
+               uds_broadcast_cond(&session->request_cond);
+       }
+       mutex_unlock(&session->request_mutex);
+
+       if (no_work)
+               return uds_status_to_errno(result);
+
+       if (rebuilding)
+               suspend_rebuild(session);
+       else if (save)
+               result = save_index(session);
+       else
+               result = uds_flush_index_session(session);
+
+       mutex_lock(&session->request_mutex);
+       session->state &= ~IS_FLAG_WAITING;
+       session->state |= IS_FLAG_SUSPENDED;
+       uds_broadcast_cond(&session->request_cond);
+       mutex_unlock(&session->request_mutex);
+       return uds_status_to_errno(result);
+}
+
+static int replace_device(struct uds_index_session *session, struct block_device *bdev)
+{
+       int result;
+
+       result = uds_replace_index_storage(session->index, bdev);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       session->parameters.bdev = bdev;
+       return UDS_SUCCESS;
+}
+
+/*
+ * Resume index operation after being suspended. If the index is suspended and the supplied block
+ * device differs from the current backing store, the index will start using the new backing store.
+ */
+int uds_resume_index_session(struct uds_index_session *session,
+                            struct block_device *bdev)
+{
+       int result = UDS_SUCCESS;
+       bool no_work = false;
+       bool resume_replay = false;
+
+       mutex_lock(&session->request_mutex);
+       if (session->state & IS_FLAG_WAITING) {
+               uds_log_info("Index session is already changing state");
+               no_work = true;
+               result = -EBUSY;
+       } else if (!(session->state & IS_FLAG_SUSPENDED)) {
+               /* If not suspended, just succeed. */
+               no_work = true;
+               result = UDS_SUCCESS;
+       } else {
+               session->state |= IS_FLAG_WAITING;
+               if (session->state & IS_FLAG_LOADING)
+                       resume_replay = true;
+       }
+       mutex_unlock(&session->request_mutex);
+
+       if (no_work)
+               return result;
+
+       if ((session->index != NULL) && (bdev != session->parameters.bdev)) {
+               result = replace_device(session, bdev);
+               if (result != UDS_SUCCESS) {
+                       mutex_lock(&session->request_mutex);
+                       session->state &= ~IS_FLAG_WAITING;
+                       uds_broadcast_cond(&session->request_cond);
+                       mutex_unlock(&session->request_mutex);
+                       return uds_status_to_errno(result);
+               }
+       }
+
+       if (resume_replay) {
+               mutex_lock(&session->load_context.mutex);
+               switch (session->load_context.status) {
+               case INDEX_SUSPENDED:
+                       session->load_context.status = INDEX_OPENING;
+                       /* Notify the index to start replaying again. */
+                       uds_broadcast_cond(&session->load_context.cond);
+                       break;
+
+               case INDEX_READY:
+                       /* There is no index rebuild to resume. */
+                       break;
+
+               case INDEX_OPENING:
+               case INDEX_SUSPENDING:
+               case INDEX_FREEING:
+               default:
+                       /* These cases should not happen; do nothing. */
+                       ASSERT_LOG_ONLY(false, "Bad load context state %u",
+                                       session->load_context.status);
+                       break;
+               }
+               mutex_unlock(&session->load_context.mutex);
+       }
+
+       mutex_lock(&session->request_mutex);
+       session->state &= ~IS_FLAG_WAITING;
+       session->state &= ~IS_FLAG_SUSPENDED;
+       uds_broadcast_cond(&session->request_cond);
+       mutex_unlock(&session->request_mutex);
+       return UDS_SUCCESS;
+}
+
+static int save_and_free_index(struct uds_index_session *index_session)
+{
+       int result = UDS_SUCCESS;
+       bool suspended;
+       struct uds_index *index = index_session->index;
+
+       if (index == NULL)
+               return UDS_SUCCESS;
+
+       mutex_lock(&index_session->request_mutex);
+       suspended = (index_session->state & IS_FLAG_SUSPENDED);
+       mutex_unlock(&index_session->request_mutex);
+
+       if (!suspended) {
+               result = uds_save_index(index);
+               if (result != UDS_SUCCESS)
+                       uds_log_warning_strerror(result,
+                                                "ignoring error from save_index");
+       }
+       uds_free_index(index);
+       index_session->index = NULL;
+
+       /*
+        * Reset all index state that happens to be in the index
+        * session, so it doesn't affect any future index.
+        */
+       mutex_lock(&index_session->load_context.mutex);
+       index_session->load_context.status = INDEX_OPENING;
+       mutex_unlock(&index_session->load_context.mutex);
+
+       mutex_lock(&index_session->request_mutex);
+       /* Only the suspend bit will remain relevant. */
+       index_session->state &= IS_FLAG_SUSPENDED;
+       mutex_unlock(&index_session->request_mutex);
+
+       return result;
+}
+
+/* Save and close the current index. */
+int uds_close_index(struct uds_index_session *index_session)
+{
+       int result = UDS_SUCCESS;
+
+       /* Wait for any current index state change to complete. */
+       mutex_lock(&index_session->request_mutex);
+       while ((index_session->state & IS_FLAG_WAITING) ||
+              (index_session->state & IS_FLAG_CLOSING)) {
+               uds_wait_cond(&index_session->request_cond,
+                             &index_session->request_mutex);
+       }
+
+       if (index_session->state & IS_FLAG_SUSPENDED) {
+               uds_log_info("Index session is suspended");
+               result = -EBUSY;
+       } else if ((index_session->state & IS_FLAG_DESTROYING) ||
+                  !(index_session->state & IS_FLAG_LOADED)) {
+               /* The index doesn't exist, hasn't finished loading, or is being destroyed. */
+               result = UDS_NO_INDEX;
+       } else {
+               index_session->state |= IS_FLAG_CLOSING;
+       }
+       mutex_unlock(&index_session->request_mutex);
+       if (result != UDS_SUCCESS)
+               return uds_status_to_errno(result);
+
+       uds_log_debug("Closing index");
+       wait_for_no_requests_in_progress(index_session);
+       result = save_and_free_index(index_session);
+       uds_log_debug("Closed index");
+
+       mutex_lock(&index_session->request_mutex);
+       index_session->state &= ~IS_FLAG_CLOSING;
+       uds_broadcast_cond(&index_session->request_cond);
+       mutex_unlock(&index_session->request_mutex);
+       return uds_status_to_errno(result);
+}
+
+/* This will save and close an open index before destroying the session. */
+int uds_destroy_index_session(struct uds_index_session *index_session)
+{
+       int result;
+       bool load_pending = false;
+
+       uds_log_debug("Destroying index session");
+
+       /* Wait for any current index state change to complete. */
+       mutex_lock(&index_session->request_mutex);
+       while ((index_session->state & IS_FLAG_WAITING) ||
+              (index_session->state & IS_FLAG_CLOSING)) {
+               uds_wait_cond(&index_session->request_cond,
+                             &index_session->request_mutex);
+       }
+
+       if (index_session->state & IS_FLAG_DESTROYING) {
+               mutex_unlock(&index_session->request_mutex);
+               uds_log_info("Index session is already closing");
+               return -EBUSY;
+       }
+
+       index_session->state |= IS_FLAG_DESTROYING;
+       load_pending = ((index_session->state & IS_FLAG_LOADING) &&
+                       (index_session->state & IS_FLAG_SUSPENDED));
+       mutex_unlock(&index_session->request_mutex);
+
+       if (load_pending) {
+               /* Tell the index to terminate the rebuild. */
+               mutex_lock(&index_session->load_context.mutex);
+               if (index_session->load_context.status == INDEX_SUSPENDED) {
+                       index_session->load_context.status = INDEX_FREEING;
+                       uds_broadcast_cond(&index_session->load_context.cond);
+               }
+               mutex_unlock(&index_session->load_context.mutex);
+
+               /* Wait until the load exits before proceeding. */
+               mutex_lock(&index_session->request_mutex);
+               while (index_session->state & IS_FLAG_LOADING) {
+                       uds_wait_cond(&index_session->request_cond,
+                                     &index_session->request_mutex);
+               }
+               mutex_unlock(&index_session->request_mutex);
+       }
+
+       wait_for_no_requests_in_progress(index_session);
+       result = save_and_free_index(index_session);
+       uds_request_queue_finish(index_session->callback_queue);
+       index_session->callback_queue = NULL;
+       uds_log_debug("Destroyed index session");
+       uds_free(index_session);
+       return uds_status_to_errno(result);
+}
+
+/* Wait until all callbacks for index operations are complete. */
+int uds_flush_index_session(struct uds_index_session *index_session)
+{
+       wait_for_no_requests_in_progress(index_session);
+       uds_wait_for_idle_index(index_session->index);
+       return UDS_SUCCESS;
+}
+
+/* Statistics collection is intended to be thread-safe. */
+static void collect_stats(const struct uds_index_session *index_session,
+                         struct uds_index_stats *stats)
+{
+       const struct session_stats *session_stats = &index_session->stats;
+
+       stats->current_time = ktime_to_seconds(current_time_ns(CLOCK_REALTIME));
+       stats->posts_found = READ_ONCE(session_stats->posts_found);
+       stats->in_memory_posts_found = READ_ONCE(session_stats->posts_found_open_chapter);
+       stats->dense_posts_found = READ_ONCE(session_stats->posts_found_dense);
+       stats->sparse_posts_found = READ_ONCE(session_stats->posts_found_sparse);
+       stats->posts_not_found = READ_ONCE(session_stats->posts_not_found);
+       stats->updates_found = READ_ONCE(session_stats->updates_found);
+       stats->updates_not_found = READ_ONCE(session_stats->updates_not_found);
+       stats->deletions_found = READ_ONCE(session_stats->deletions_found);
+       stats->deletions_not_found = READ_ONCE(session_stats->deletions_not_found);
+       stats->queries_found = READ_ONCE(session_stats->queries_found);
+       stats->queries_not_found = READ_ONCE(session_stats->queries_not_found);
+       stats->requests = READ_ONCE(session_stats->requests);
+}
+
+int uds_get_index_session_stats(struct uds_index_session *index_session,
+                               struct uds_index_stats *stats)
+{
+       if (stats == NULL) {
+               uds_log_error("received a NULL index stats pointer");
+               return -EINVAL;
+       }
+
+       collect_stats(index_session, stats);
+       if (index_session->index != NULL) {
+               uds_get_index_stats(index_session->index, stats);
+       } else {
+               stats->entries_indexed = 0;
+               stats->memory_used = 0;
+               stats->collisions = 0;
+               stats->entries_discarded = 0;
+       }
+
+       return UDS_SUCCESS;
+}
+
+void uds_wait_cond(struct cond_var *cv, struct mutex *mutex)
+{
+       DEFINE_WAIT(__wait);
+
+       prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE);
+       mutex_unlock(mutex);
+       schedule();
+       finish_wait(&cv->wait_queue, &__wait);
+       mutex_lock(mutex);
+}
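
Every caller of uds_wait_cond() in this file follows the same shape, sketched below with a hypothetical flag: hold the mutex and re-check the predicate in a loop, since the wakeup may arrive after the state has already changed again.

static void example_wait_for_flag_clear(struct uds_index_session *session,
                                        unsigned int flag)
{
        mutex_lock(&session->request_mutex);
        while (session->state & flag)
                uds_wait_cond(&session->request_cond, &session->request_mutex);
        mutex_unlock(&session->request_mutex);
}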
diff --git a/drivers/md/dm-vdo/indexer/index-session.h b/drivers/md/dm-vdo/indexer/index-session.h
new file mode 100644 (file)
index 0000000..066648f
--- /dev/null
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_SESSION_H
+#define UDS_INDEX_SESSION_H
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+
+#include "thread-utils.h"
+
+#include "config.h"
+#include "indexer.h"
+
+/*
+ * The index session mediates all interactions with a UDS index. Once the index session is created,
+ * it can be used to open, close, suspend, or recreate an index. It implements the majority of the
+ * functions in the top-level UDS API.
+ *
+ * If any deduplication request fails due to an internal error, the index is marked disabled. It
+ * will not accept any further requests and can only be closed. Closing the index will clear the
+ * disabled flag, and the index can then be reopened and recovered using the same index session.
+ */
+
+struct __aligned(L1_CACHE_BYTES) session_stats {
+       /* Post requests that found an entry */
+       u64 posts_found;
+       /* Post requests found in the open chapter */
+       u64 posts_found_open_chapter;
+       /* Post requests found in the dense index */
+       u64 posts_found_dense;
+       /* Post requests found in the sparse index */
+       u64 posts_found_sparse;
+       /* Post requests that did not find an entry */
+       u64 posts_not_found;
+       /* Update requests that found an entry */
+       u64 updates_found;
+       /* Update requests that did not find an entry */
+       u64 updates_not_found;
+       /* Delete requests that found an entry */
+       u64 deletions_found;
+       /* Delete requests that did not find an entry */
+       u64 deletions_not_found;
+       /* Query requests that found an entry */
+       u64 queries_found;
+       /* Query requests that did not find an entry */
+       u64 queries_not_found;
+       /* Total number of requests */
+       u64 requests;
+};
+
+enum index_suspend_status {
+       /* An index load has started but the index is not ready for use. */
+       INDEX_OPENING = 0,
+       /* The index is able to handle requests. */
+       INDEX_READY,
+       /* The index is attempting to suspend a rebuild. */
+       INDEX_SUSPENDING,
+       /* An index rebuild has been suspended. */
+       INDEX_SUSPENDED,
+       /* An index rebuild is being stopped in order to shut down. */
+       INDEX_FREEING,
+};
+
+struct index_load_context {
+       struct mutex mutex;
+       struct cond_var cond;
+       enum index_suspend_status status;
+};
+
+struct uds_index_session {
+       unsigned int state;
+       struct uds_index *index;
+       struct uds_request_queue *callback_queue;
+       struct uds_parameters parameters;
+       struct index_load_context load_context;
+       struct mutex request_mutex;
+       struct cond_var request_cond;
+       int request_count;
+       struct session_stats stats;
+};
+
+#endif /* UDS_INDEX_SESSION_H */
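
A hypothetical end-to-end lifecycle for the session (illustrative only, not part of this patch): create the session, open an index on a block device, issue requests, then close and destroy. Only the bdev parameter is shown; a real caller also fills in the sizing fields of uds_parameters, which are omitted here.

static int example_session_lifecycle(struct block_device *bdev)
{
        int result;
        struct uds_index_session *session;
        struct uds_parameters parameters = { .bdev = bdev };

        result = uds_create_index_session(&session);
        if (result != 0)
                return result;

        result = uds_open_index(UDS_CREATE, &parameters, session);
        if (result != 0) {
                uds_destroy_index_session(session);
                return result;
        }

        /* ... submit deduplication requests with uds_launch_request() ... */

        result = uds_close_index(session);
        uds_destroy_index_session(session);
        return result;
}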
diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c
new file mode 100644 (file)
index 0000000..35e3b45
--- /dev/null
@@ -0,0 +1,1388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+
+#include "index.h"
+
+#include "logger.h"
+#include "memory-alloc.h"
+
+#include "funnel-requestqueue.h"
+#include "hash-utils.h"
+#include "sparse-cache.h"
+
+static const u64 NO_LAST_SAVE = U64_MAX;
+
+/*
+ * When searching for deduplication records, the index first searches the volume index, and then
+ * searches the chapter index for the relevant chapter. If the chapter has been fully committed to
+ * storage, the chapter pages are loaded into the page cache. If the chapter has not yet been
+ * committed (either the open chapter or a recently closed one), the index searches the in-memory
+ * representation of the chapter. Finally, if the volume index does not find a record and the index
+ * is sparse, the index will search the sparse cache.
+ *
+ * The index sends two kinds of messages to coordinate between zones: chapter close messages for the
+ * chapter writer, and sparse cache barrier messages for the sparse cache.
+ *
+ * The chapter writer is responsible for committing chapters of records to storage. Since zones can
+ * get different numbers of records, some zones may fall behind others. Each time a zone fills up
+ * its available space in a chapter, it informs the chapter writer that the chapter is complete,
+ * and also informs all other zones that it has closed the chapter. Each other zone will then close
+ * the chapter immediately, regardless of how full it is, in order to minimize skew between zones.
+ * Once every zone has closed the chapter, the chapter writer will commit that chapter to storage.
+ *
+ * The last zone to close the chapter also removes the oldest chapter from the volume index.
+ * Although that chapter is invalid for zones that have moved on, the existence of the open chapter
+ * means that those zones will never ask the volume index about it. No zone is allowed to get more
+ * than one chapter ahead of any other. If a zone is so far ahead that it tries to close another
+ * chapter before the previous one has been closed by all zones, it is forced to wait.
+ *
+ * The sparse cache relies on having the same set of chapter indexes available to all zones. When a
+ * request wants to add a chapter to the sparse cache, it sends a barrier message to each zone
+ * during the triage stage that acts as a rendezvous. Once every zone has reached the barrier and
+ * paused its operations, the cache membership is changed and each zone is then informed that it
+ * can proceed. More details can be found in the sparse cache documentation.
+ *
+ * If a sparse index has only one zone, it will not create a triage queue, but it still needs the
+ * barrier message to change the sparse cache membership, so the index simulates the message by
+ * invoking the handler directly.
+ */
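
The chapter-close rendezvous described above can be modeled in a few lines of user-space code (POSIX threads standing in for the kernel primitives; all names are hypothetical): each zone reports in, and the writer proceeds only once the last zone has arrived.

#include <pthread.h>

struct writer_model {
        pthread_mutex_t mutex;          /* protects zones_closed */
        pthread_cond_t cond;            /* signalled on every arrival */
        unsigned int zone_count;
        unsigned int zones_closed;
};

/* Called by each zone when it has closed its portion of the chapter. */
static void model_close_chapter(struct writer_model *w)
{
        pthread_mutex_lock(&w->mutex);
        w->zones_closed++;
        pthread_cond_broadcast(&w->cond);
        pthread_mutex_unlock(&w->mutex);
}

/* Called by the writer thread before committing the chapter to storage. */
static void model_wait_for_all_zones(struct writer_model *w)
{
        pthread_mutex_lock(&w->mutex);
        while (w->zones_closed < w->zone_count)
                pthread_cond_wait(&w->cond, &w->mutex);
        w->zones_closed = 0;
        pthread_mutex_unlock(&w->mutex);
}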
+
+struct chapter_writer {
+       /* The index to which we belong */
+       struct uds_index *index;
+       /* The thread to do the writing */
+       struct thread *thread;
+       /* The lock protecting the following fields */
+       struct mutex mutex;
+       /* The condition signalled on state changes */
+       struct cond_var cond;
+       /* Set to true to stop the thread */
+       bool stop;
+       /* The result from the most recent write */
+       int result;
+       /* The number of bytes allocated by the chapter writer */
+       size_t memory_size;
+       /* The number of zones which have submitted a chapter for writing */
+       unsigned int zones_to_write;
+       /* Open chapter index used by uds_close_open_chapter() */
+       struct open_chapter_index *open_chapter_index;
+       /* Collated records used by uds_close_open_chapter() */
+       struct uds_volume_record *collated_records;
+       /* The chapters to write (one per zone) */
+       struct open_chapter_zone *chapters[];
+};
+
+static bool is_zone_chapter_sparse(const struct index_zone *zone, u64 virtual_chapter)
+{
+       return uds_is_chapter_sparse(zone->index->volume->geometry,
+                                    zone->oldest_virtual_chapter,
+                                    zone->newest_virtual_chapter, virtual_chapter);
+}
+
+static int launch_zone_message(struct uds_zone_message message, unsigned int zone,
+                              struct uds_index *index)
+{
+       int result;
+       struct uds_request *request;
+
+       result = uds_allocate(1, struct uds_request, __func__, &request);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       request->index = index;
+       request->unbatched = true;
+       request->zone_number = zone;
+       request->zone_message = message;
+
+       uds_enqueue_request(request, STAGE_MESSAGE);
+       return UDS_SUCCESS;
+}
+
+static void enqueue_barrier_messages(struct uds_index *index, u64 virtual_chapter)
+{
+       struct uds_zone_message message = {
+               .type = UDS_MESSAGE_SPARSE_CACHE_BARRIER,
+               .virtual_chapter = virtual_chapter,
+       };
+       unsigned int zone;
+
+       for (zone = 0; zone < index->zone_count; zone++) {
+               int result = launch_zone_message(message, zone, index);
+
+               ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation");
+       }
+}
+
+/*
+ * Determine whether this request should trigger a sparse cache barrier message to change the
+ * membership of the sparse cache. If a change in membership is desired, the function returns the
+ * chapter number to add.
+ */
+static u64 triage_index_request(struct uds_index *index, struct uds_request *request)
+{
+       u64 virtual_chapter;
+       struct index_zone *zone;
+
+       virtual_chapter = uds_lookup_volume_index_name(index->volume_index,
+                                                      &request->record_name);
+       if (virtual_chapter == NO_CHAPTER)
+               return NO_CHAPTER;
+
+       zone = index->zones[request->zone_number];
+       if (!is_zone_chapter_sparse(zone, virtual_chapter))
+               return NO_CHAPTER;
+
+       /*
+        * FIXME: Optimize for a common case by remembering the chapter from the most recent
+        * barrier message and skipping this chapter if it is the same.
+        */
+
+       return virtual_chapter;
+}
+
+/*
+ * Simulate a message to change the sparse cache membership for a single-zone sparse index. This
+ * allows us to forgo the complicated locking required by a multi-zone sparse index. Any other kind
+ * of index does nothing here.
+ */
+static int simulate_index_zone_barrier_message(struct index_zone *zone,
+                                              struct uds_request *request)
+{
+       u64 sparse_virtual_chapter;
+
+       if ((zone->index->zone_count > 1) ||
+           !uds_is_sparse_index_geometry(zone->index->volume->geometry))
+               return UDS_SUCCESS;
+
+       sparse_virtual_chapter = triage_index_request(zone->index, request);
+       if (sparse_virtual_chapter == NO_CHAPTER)
+               return UDS_SUCCESS;
+
+       return uds_update_sparse_cache(zone, sparse_virtual_chapter);
+}
+
+/* This is the request processing function for the triage queue. */
+static void triage_request(struct uds_request *request)
+{
+       struct uds_index *index = request->index;
+       u64 sparse_virtual_chapter = triage_index_request(index, request);
+
+       if (sparse_virtual_chapter != NO_CHAPTER)
+               enqueue_barrier_messages(index, sparse_virtual_chapter);
+
+       uds_enqueue_request(request, STAGE_INDEX);
+}
+
+static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_number)
+{
+       int result;
+       struct chapter_writer *writer = index->chapter_writer;
+
+       mutex_lock(&writer->mutex);
+       while (index->newest_virtual_chapter < current_chapter_number)
+               uds_wait_cond(&writer->cond, &writer->mutex);
+       result = writer->result;
+       mutex_unlock(&writer->mutex);
+
+       if (result != UDS_SUCCESS)
+               return uds_log_error_strerror(result,
+                                             "Writing of previous open chapter failed");
+
+       return UDS_SUCCESS;
+}
+
+static int swap_open_chapter(struct index_zone *zone)
+{
+       int result;
+       struct open_chapter_zone *temporary_chapter;
+
+       result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       temporary_chapter = zone->open_chapter;
+       zone->open_chapter = zone->writing_chapter;
+       zone->writing_chapter = temporary_chapter;
+       return UDS_SUCCESS;
+}
+
+/*
+ * Inform the chapter writer that this zone is done with this chapter. The chapter won't start
+ * writing until all zones have closed it.
+ */
+static unsigned int start_closing_chapter(struct uds_index *index,
+                                         unsigned int zone_number,
+                                         struct open_chapter_zone *chapter)
+{
+       unsigned int finished_zones;
+       struct chapter_writer *writer = index->chapter_writer;
+
+       mutex_lock(&writer->mutex);
+       finished_zones = ++writer->zones_to_write;
+       writer->chapters[zone_number] = chapter;
+       uds_broadcast_cond(&writer->cond);
+       mutex_unlock(&writer->mutex);
+
+       return finished_zones;
+}
+
+static int announce_chapter_closed(struct index_zone *zone, u64 closed_chapter)
+{
+       int result;
+       unsigned int i;
+       struct uds_zone_message zone_message = {
+               .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
+               .virtual_chapter = closed_chapter,
+       };
+
+       for (i = 0; i < zone->index->zone_count; i++) {
+               if (zone->id == i)
+                       continue;
+
+               result = launch_zone_message(zone_message, i, zone->index);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int open_next_chapter(struct index_zone *zone)
+{
+       int result;
+       u64 closed_chapter;
+       u64 expiring;
+       unsigned int finished_zones;
+       u32 expire_chapters;
+
+       uds_log_debug("closing chapter %llu of zone %u after %u entries (%u short)",
+                     (unsigned long long) zone->newest_virtual_chapter, zone->id,
+                     zone->open_chapter->size,
+                     zone->open_chapter->capacity - zone->open_chapter->size);
+
+       result = swap_open_chapter(zone);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       closed_chapter = zone->newest_virtual_chapter++;
+       uds_set_volume_index_zone_open_chapter(zone->index->volume_index, zone->id,
+                                              zone->newest_virtual_chapter);
+       uds_reset_open_chapter(zone->open_chapter);
+
+       finished_zones = start_closing_chapter(zone->index, zone->id,
+                                              zone->writing_chapter);
+       if ((finished_zones == 1) && (zone->index->zone_count > 1)) {
+               result = announce_chapter_closed(zone, closed_chapter);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       expiring = zone->oldest_virtual_chapter;
+       expire_chapters = uds_chapters_to_expire(zone->index->volume->geometry,
+                                                zone->newest_virtual_chapter);
+       zone->oldest_virtual_chapter += expire_chapters;
+
+       if (finished_zones < zone->index->zone_count)
+               return UDS_SUCCESS;
+
+       while (expire_chapters-- > 0)
+               uds_forget_chapter(zone->index->volume, expiring++);
+
+       return UDS_SUCCESS;
+}
+
+static int handle_chapter_closed(struct index_zone *zone, u64 virtual_chapter)
+{
+       if (zone->newest_virtual_chapter == virtual_chapter)
+               return open_next_chapter(zone);
+
+       return UDS_SUCCESS;
+}
+
+static int dispatch_index_zone_control_request(struct uds_request *request)
+{
+       struct uds_zone_message *message = &request->zone_message;
+       struct index_zone *zone = request->index->zones[request->zone_number];
+
+       switch (message->type) {
+       case UDS_MESSAGE_SPARSE_CACHE_BARRIER:
+               return uds_update_sparse_cache(zone, message->virtual_chapter);
+
+       case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED:
+               return handle_chapter_closed(zone, message->virtual_chapter);
+
+       default:
+               uds_log_error("invalid message type: %d", message->type);
+               return UDS_INVALID_ARGUMENT;
+       }
+}
+
+static void set_request_location(struct uds_request *request,
+                                enum uds_index_region new_location)
+{
+       request->location = new_location;
+       request->found = ((new_location == UDS_LOCATION_IN_OPEN_CHAPTER) ||
+                         (new_location == UDS_LOCATION_IN_DENSE) ||
+                         (new_location == UDS_LOCATION_IN_SPARSE));
+}
+
+static void set_chapter_location(struct uds_request *request,
+                                const struct index_zone *zone, u64 virtual_chapter)
+{
+       request->found = true;
+       if (virtual_chapter == zone->newest_virtual_chapter)
+               request->location = UDS_LOCATION_IN_OPEN_CHAPTER;
+       else if (is_zone_chapter_sparse(zone, virtual_chapter))
+               request->location = UDS_LOCATION_IN_SPARSE;
+       else
+               request->location = UDS_LOCATION_IN_DENSE;
+}
+
+static int search_sparse_cache_in_zone(struct index_zone *zone, struct uds_request *request,
+                                      u64 virtual_chapter, bool *found)
+{
+       int result;
+       struct volume *volume;
+       u16 record_page_number;
+       u32 chapter;
+
+       result = uds_search_sparse_cache(zone, &request->record_name, &virtual_chapter,
+                                        &record_page_number);
+       if ((result != UDS_SUCCESS) || (virtual_chapter == NO_CHAPTER))
+               return result;
+
+       request->virtual_chapter = virtual_chapter;
+       volume = zone->index->volume;
+       chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
+       return uds_search_cached_record_page(volume, request, chapter,
+                                            record_page_number, found);
+}
+
+static int get_record_from_zone(struct index_zone *zone, struct uds_request *request,
+                               bool *found)
+{
+       struct volume *volume;
+
+       if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) {
+               *found = true;
+               return UDS_SUCCESS;
+       } else if (request->location == UDS_LOCATION_UNAVAILABLE) {
+               *found = false;
+               return UDS_SUCCESS;
+       }
+
+       if (request->virtual_chapter == zone->newest_virtual_chapter) {
+               uds_search_open_chapter(zone->open_chapter, &request->record_name,
+                                       &request->old_metadata, found);
+               return UDS_SUCCESS;
+       }
+
+       if ((zone->newest_virtual_chapter > 0) &&
+           (request->virtual_chapter == (zone->newest_virtual_chapter - 1)) &&
+           (zone->writing_chapter->size > 0)) {
+               uds_search_open_chapter(zone->writing_chapter, &request->record_name,
+                                       &request->old_metadata, found);
+               return UDS_SUCCESS;
+       }
+
+       volume = zone->index->volume;
+       if (is_zone_chapter_sparse(zone, request->virtual_chapter) &&
+           uds_sparse_cache_contains(volume->sparse_cache, request->virtual_chapter,
+                                     request->zone_number))
+               return search_sparse_cache_in_zone(zone, request,
+                                                  request->virtual_chapter, found);
+
+       return uds_search_volume_page_cache(volume, request, found);
+}
+
+static int put_record_in_zone(struct index_zone *zone, struct uds_request *request,
+                             const struct uds_record_data *metadata)
+{
+       unsigned int remaining;
+
+       remaining = uds_put_open_chapter(zone->open_chapter, &request->record_name,
+                                        metadata);
+       if (remaining == 0)
+               return open_next_chapter(zone);
+
+       return UDS_SUCCESS;
+}
+
+static int search_index_zone(struct index_zone *zone, struct uds_request *request)
+{
+       int result;
+       struct volume_index_record record;
+       bool overflow_record, found = false;
+       struct uds_record_data *metadata;
+       u64 chapter;
+
+       result = uds_get_volume_index_record(zone->index->volume_index,
+                                            &request->record_name, &record);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (record.is_found) {
+               if (request->requeued && request->virtual_chapter != record.virtual_chapter)
+                       set_request_location(request, UDS_LOCATION_UNKNOWN);
+
+               request->virtual_chapter = record.virtual_chapter;
+               result = get_record_from_zone(zone, request, &found);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       if (found)
+               set_chapter_location(request, zone, record.virtual_chapter);
+
+       /*
+        * If a record has overflowed a chapter index in more than one chapter (or overflowed in
+        * one chapter and collided with an existing record), it will exist as a collision record
+        * in the volume index, but we won't find it in the volume. This case needs special
+        * handling.
+        */
+       overflow_record = (record.is_found && record.is_collision && !found);
+       chapter = zone->newest_virtual_chapter;
+       if (found || overflow_record) {
+               if ((request->type == UDS_QUERY_NO_UPDATE) ||
+                   ((request->type == UDS_QUERY) && overflow_record)) {
+                       /* There is nothing left to do. */
+                       return UDS_SUCCESS;
+               }
+
+               if (record.virtual_chapter != chapter) {
+                       /*
+                        * Update the volume index to reference the new chapter for the block. If
+                        * the record had been deleted or dropped from the chapter index, it will
+                        * be back.
+                        */
+                       result = uds_set_volume_index_record_chapter(&record, chapter);
+               } else if (request->type != UDS_UPDATE) {
+                       /* The record is already in the open chapter. */
+                       return UDS_SUCCESS;
+               }
+       } else {
+               /*
+                * The record wasn't in the volume index, so check whether the
+                * name is in a cached sparse chapter. If we found the name on
+                * a previous search, use that result instead.
+                */
+               if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) {
+                       found = true;
+               } else if (request->location == UDS_LOCATION_UNAVAILABLE) {
+                       found = false;
+               } else if (uds_is_sparse_index_geometry(zone->index->volume->geometry) &&
+                          !uds_is_volume_index_sample(zone->index->volume_index,
+                                                      &request->record_name)) {
+                       result = search_sparse_cache_in_zone(zone, request, NO_CHAPTER,
+                                                            &found);
+                       if (result != UDS_SUCCESS)
+                               return result;
+               }
+
+               if (found)
+                       set_request_location(request, UDS_LOCATION_IN_SPARSE);
+
+               if ((request->type == UDS_QUERY_NO_UPDATE) ||
+                   ((request->type == UDS_QUERY) && !found)) {
+                       /* There is nothing left to do. */
+                       return UDS_SUCCESS;
+               }
+
+               /*
+                * Add a new entry to the volume index referencing the open chapter. This needs to
+                * be done both for new records, and for records from cached sparse chapters.
+                */
+               result = uds_put_volume_index_record(&record, chapter);
+       }
+
+       if (result == UDS_OVERFLOW) {
+               /*
+                * The volume index encountered a delta list overflow. The condition was already
+                * logged. We will go on without adding the record to the open chapter.
+                */
+               return UDS_SUCCESS;
+       }
+
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (!found || (request->type == UDS_UPDATE)) {
+               /* This is a new record or we're updating an existing record. */
+               metadata = &request->new_metadata;
+       } else {
+               /* Move the existing record to the open chapter. */
+               metadata = &request->old_metadata;
+       }
+
+       return put_record_in_zone(zone, request, metadata);
+}
+
+static int remove_from_index_zone(struct index_zone *zone, struct uds_request *request)
+{
+       int result;
+       struct volume_index_record record;
+
+       result = uds_get_volume_index_record(zone->index->volume_index,
+                                            &request->record_name, &record);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (!record.is_found)
+               return UDS_SUCCESS;
+
+       /* If the request was requeued, check whether the saved state is still valid. */
+
+       if (record.is_collision) {
+               set_chapter_location(request, zone, record.virtual_chapter);
+       } else {
+               /* Non-collision records are hints, so resolve the name in the chapter. */
+               bool found;
+
+               if (request->requeued && request->virtual_chapter != record.virtual_chapter)
+                       set_request_location(request, UDS_LOCATION_UNKNOWN);
+
+               request->virtual_chapter = record.virtual_chapter;
+               result = get_record_from_zone(zone, request, &found);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               if (!found) {
+                       /* There is no record to remove. */
+                       return UDS_SUCCESS;
+               }
+       }
+
+       set_chapter_location(request, zone, record.virtual_chapter);
+
+       /*
+        * Delete the volume index entry for the named record only. Note that a later search might
+	 * Delete the volume index entry for the named record only. Note that a later search might
+	 * return stale advice if there is a colliding name in the same chapter, but that is a very
+	 * rare case (1 in 2^21).
+       result = uds_remove_volume_index_record(&record);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       /*
+        * If the record is in the open chapter, we must remove it or mark it deleted to avoid
+        * trouble if the record is added again later.
+        */
+       if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER)
+               uds_remove_from_open_chapter(zone->open_chapter, &request->record_name);
+
+       return UDS_SUCCESS;
+}
+
+static int dispatch_index_request(struct uds_index *index, struct uds_request *request)
+{
+       int result;
+       struct index_zone *zone = index->zones[request->zone_number];
+
+       if (!request->requeued) {
+               result = simulate_index_zone_barrier_message(zone, request);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       switch (request->type) {
+       case UDS_POST:
+       case UDS_UPDATE:
+       case UDS_QUERY:
+       case UDS_QUERY_NO_UPDATE:
+               result = search_index_zone(zone, request);
+               break;
+
+       case UDS_DELETE:
+               result = remove_from_index_zone(zone, request);
+               break;
+
+       default:
+               result = uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+                                                 "invalid request type: %d",
+                                                 request->type);
+               break;
+       }
+
+       return result;
+}
+
+/* This is the request processing function invoked by each zone's thread. */
+static void execute_zone_request(struct uds_request *request)
+{
+       int result;
+       struct uds_index *index = request->index;
+
+       if (request->zone_message.type != UDS_MESSAGE_NONE) {
+               result = dispatch_index_zone_control_request(request);
+               if (result != UDS_SUCCESS) {
+                       uds_log_error_strerror(result, "error executing message: %d",
+                                              request->zone_message.type);
+               }
+
+               /* Once the message is processed it can be freed. */
+               uds_free(uds_forget(request));
+               return;
+       }
+
+       index->need_to_save = true;
+       if (request->requeued && (request->status != UDS_SUCCESS)) {
+               set_request_location(request, UDS_LOCATION_UNAVAILABLE);
+               index->callback(request);
+               return;
+       }
+
+       result = dispatch_index_request(index, request);
+       if (result == UDS_QUEUED) {
+               /* The request has been requeued so don't let it complete. */
+               return;
+       }
+
+       if (!request->found)
+               set_request_location(request, UDS_LOCATION_UNAVAILABLE);
+
+       request->status = result;
+       index->callback(request);
+}
+
+static int initialize_index_queues(struct uds_index *index,
+                                  const struct index_geometry *geometry)
+{
+       int result;
+       unsigned int i;
+
+       for (i = 0; i < index->zone_count; i++) {
+               result = uds_make_request_queue("indexW", &execute_zone_request,
+                                               &index->zone_queues[i]);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       /* The triage queue is only needed for sparse multi-zone indexes. */
+       if ((index->zone_count > 1) && uds_is_sparse_index_geometry(geometry)) {
+               result = uds_make_request_queue("triageW", &triage_request,
+                                               &index->triage_queue);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       return UDS_SUCCESS;
+}
+
+/* This is the driver function for the chapter writer thread. */
+static void close_chapters(void *arg)
+{
+       int result;
+       struct chapter_writer *writer = arg;
+       struct uds_index *index = writer->index;
+
+       uds_log_debug("chapter writer starting");
+       mutex_lock(&writer->mutex);
+       for (;;) {
+               while (writer->zones_to_write < index->zone_count) {
+                       if (writer->stop && (writer->zones_to_write == 0)) {
+                               /*
+                                * We've been told to stop, and all of the zones are in the same
+                                * open chapter, so we can exit now.
+                                */
+                               mutex_unlock(&writer->mutex);
+                               uds_log_debug("chapter writer stopping");
+                               return;
+                       }
+                       uds_wait_cond(&writer->cond, &writer->mutex);
+               }
+
+               /*
+                * Release the lock while closing a chapter. We probably don't need to do this, but
+                * it seems safer in principle. It's OK to access the chapter and chapter_number
+                * fields without the lock since those aren't allowed to change until we're done.
+                */
+               mutex_unlock(&writer->mutex);
+
+               if (index->has_saved_open_chapter) {
+                       /*
+                        * Remove the saved open chapter the first time we close an open chapter
+                        * after loading from a clean shutdown, or after doing a clean save. The
+                        * lack of the saved open chapter will indicate that a recovery is
+                        * necessary.
+                        */
+                       index->has_saved_open_chapter = false;
+                       result = uds_discard_open_chapter(index->layout);
+                       if (result == UDS_SUCCESS)
+                               uds_log_debug("Discarding saved open chapter");
+               }
+
+               result = uds_close_open_chapter(writer->chapters, index->zone_count,
+                                               index->volume,
+                                               writer->open_chapter_index,
+                                               writer->collated_records,
+                                               index->newest_virtual_chapter);
+
+               mutex_lock(&writer->mutex);
+               index->newest_virtual_chapter++;
+               index->oldest_virtual_chapter +=
+                       uds_chapters_to_expire(index->volume->geometry,
+                                              index->newest_virtual_chapter);
+               writer->result = result;
+               writer->zones_to_write = 0;
+               uds_broadcast_cond(&writer->cond);
+       }
+}
+
+static void stop_chapter_writer(struct chapter_writer *writer)
+{
+       struct thread *writer_thread = NULL;
+
+       mutex_lock(&writer->mutex);
+       if (writer->thread != NULL) {
+               writer_thread = writer->thread;
+               writer->thread = NULL;
+               writer->stop = true;
+               uds_broadcast_cond(&writer->cond);
+       }
+       mutex_unlock(&writer->mutex);
+
+       if (writer_thread != NULL)
+               vdo_join_threads(writer_thread);
+}
+
+static void free_chapter_writer(struct chapter_writer *writer)
+{
+       if (writer == NULL)
+               return;
+
+       stop_chapter_writer(writer);
+       uds_free_open_chapter_index(writer->open_chapter_index);
+       uds_free(writer->collated_records);
+       uds_free(writer);
+}
+
+static int make_chapter_writer(struct uds_index *index,
+                              struct chapter_writer **writer_ptr)
+{
+       int result;
+       struct chapter_writer *writer;
+       size_t collated_records_size =
+               (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter);
+
+       result = uds_allocate_extended(struct chapter_writer, index->zone_count,
+                                      struct open_chapter_zone *, "Chapter Writer",
+                                      &writer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       writer->index = index;
+       mutex_init(&writer->mutex);
+       uds_init_cond(&writer->cond);
+
+       result = uds_allocate_cache_aligned(collated_records_size, "collated records",
+                                           &writer->collated_records);
+       if (result != UDS_SUCCESS) {
+               free_chapter_writer(writer);
+               return result;
+       }
+
+       result = uds_make_open_chapter_index(&writer->open_chapter_index,
+                                            index->volume->geometry,
+                                            index->volume->nonce);
+       if (result != UDS_SUCCESS) {
+               free_chapter_writer(writer);
+               return result;
+       }
+
+       writer->memory_size = (sizeof(struct chapter_writer) +
+                              index->zone_count * sizeof(struct open_chapter_zone *) +
+                              collated_records_size +
+                              writer->open_chapter_index->memory_size);
+
+       result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread);
+       if (result != UDS_SUCCESS) {
+               free_chapter_writer(writer);
+               return result;
+       }
+
+       *writer_ptr = writer;
+       return UDS_SUCCESS;
+}
+
+static int load_index(struct uds_index *index)
+{
+       int result;
+       u64 last_save_chapter;
+
+       result = uds_load_index_state(index->layout, index);
+       if (result != UDS_SUCCESS)
+               return UDS_INDEX_NOT_SAVED_CLEANLY;
+
+       last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0);
+
+       uds_log_info("loaded index from chapter %llu through chapter %llu",
+                    (unsigned long long) index->oldest_virtual_chapter,
+                    (unsigned long long) last_save_chapter);
+
+       return UDS_SUCCESS;
+}
+
+static int rebuild_index_page_map(struct uds_index *index, u64 vcn)
+{
+       int result;
+       struct delta_index_page *chapter_index_page;
+       struct index_geometry *geometry = index->volume->geometry;
+       u32 chapter = uds_map_to_physical_chapter(geometry, vcn);
+       u32 expected_list_number = 0;
+       u32 index_page_number;
+       u32 lowest_delta_list;
+       u32 highest_delta_list;
+
+       for (index_page_number = 0;
+            index_page_number < geometry->index_pages_per_chapter;
+            index_page_number++) {
+               result = uds_get_volume_index_page(index->volume, chapter,
+                                                  index_page_number,
+                                                  &chapter_index_page);
+               if (result != UDS_SUCCESS) {
+                       return uds_log_error_strerror(result,
+                                                     "failed to read index page %u in chapter %u",
+                                                     index_page_number, chapter);
+               }
+
+               lowest_delta_list = chapter_index_page->lowest_list_number;
+               highest_delta_list = chapter_index_page->highest_list_number;
+               if (lowest_delta_list != expected_list_number) {
+                       return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                                     "chapter %u index page %u is corrupt",
+                                                     chapter, index_page_number);
+               }
+
+               uds_update_index_page_map(index->volume->index_page_map, vcn, chapter,
+                                         index_page_number, highest_delta_list);
+               expected_list_number = highest_delta_list + 1;
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int replay_record(struct uds_index *index, const struct uds_record_name *name,
+                        u64 virtual_chapter, bool will_be_sparse_chapter)
+{
+       int result;
+       struct volume_index_record record;
+       bool update_record;
+
+       if (will_be_sparse_chapter &&
+           !uds_is_volume_index_sample(index->volume_index, name)) {
+               /*
+                * This entry will be in a sparse chapter after the rebuild completes, and it is
+                * not a sample, so just skip over it.
+                */
+               return UDS_SUCCESS;
+       }
+
+       result = uds_get_volume_index_record(index->volume_index, name, &record);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (record.is_found) {
+               if (record.is_collision) {
+                       if (record.virtual_chapter == virtual_chapter) {
+                               /* The record is already correct. */
+                               return UDS_SUCCESS;
+                       }
+
+                       update_record = true;
+               } else if (record.virtual_chapter == virtual_chapter) {
+                       /*
+                        * There is a volume index entry pointing to the current chapter, but we
+                        * don't know if it is for the same name as the one we are currently
+                        * working on or not. For now, we're just going to assume that it isn't.
+                        * This will create one extra collision record if there was a deleted
+                        * record in the current chapter.
+                        */
+                       update_record = false;
+               } else {
+                       /*
+                        * If we're rebuilding, we don't normally want to go to disk to see if the
+                        * record exists, since we will likely have just read the record from disk
+                        * (i.e. we know it's there). The exception to this is when we find an
+                        * entry in the volume index that has a different chapter. In this case, we
+                        * need to search that chapter to determine if the volume index entry was
+                        * for the same record or a different one.
+                        */
+                       result = uds_search_volume_page_cache_for_rebuild(index->volume,
+                                                                         name,
+                                                                         record.virtual_chapter,
+                                                                         &update_record);
+                       if (result != UDS_SUCCESS)
+                               return result;
+		}
+       } else {
+               update_record = false;
+       }
+
+       if (update_record) {
+               /*
+                * Update the volume index to reference the new chapter for the block. If the
+                * record had been deleted or dropped from the chapter index, it will be back.
+                */
+               result = uds_set_volume_index_record_chapter(&record, virtual_chapter);
+       } else {
+               /*
+		 * Add a new entry to the volume index referencing the open chapter. This should be
+		 * done regardless of whether the record is brand new or a sparse record (i.e. one
+		 * that doesn't exist in the index but does exist on disk), since for a sparse
+		 * record we would want to un-sparsify it if it did exist.
+                */
+               result = uds_put_volume_index_record(&record, virtual_chapter);
+       }
+
+       if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) {
+               /* The rebuilt index will lose these records. */
+               return UDS_SUCCESS;
+       }
+
+       return result;
+}
+
+static bool check_for_suspend(struct uds_index *index)
+{
+       bool closing;
+
+       if (index->load_context == NULL)
+               return false;
+
+       mutex_lock(&index->load_context->mutex);
+       if (index->load_context->status != INDEX_SUSPENDING) {
+               mutex_unlock(&index->load_context->mutex);
+               return false;
+       }
+
+       /* Notify that we are suspended and wait for the resume. */
+       index->load_context->status = INDEX_SUSPENDED;
+       uds_broadcast_cond(&index->load_context->cond);
+
+       while ((index->load_context->status != INDEX_OPENING) &&
+              (index->load_context->status != INDEX_FREEING))
+               uds_wait_cond(&index->load_context->cond, &index->load_context->mutex);
+
+       closing = (index->load_context->status == INDEX_FREEING);
+       mutex_unlock(&index->load_context->mutex);
+       return closing;
+}
+
+static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse)
+{
+       int result;
+       u32 i;
+       u32 j;
+       const struct index_geometry *geometry;
+       u32 physical_chapter;
+
+       if (check_for_suspend(index)) {
+               uds_log_info("Replay interrupted by index shutdown at chapter %llu",
+                            (unsigned long long) virtual);
+               return -EBUSY;
+       }
+
+       geometry = index->volume->geometry;
+       physical_chapter = uds_map_to_physical_chapter(geometry, virtual);
+       uds_prefetch_volume_chapter(index->volume, physical_chapter);
+       uds_set_volume_index_open_chapter(index->volume_index, virtual);
+
+       result = rebuild_index_page_map(index, virtual);
+       if (result != UDS_SUCCESS) {
+               return uds_log_error_strerror(result,
+                                             "could not rebuild index page map for chapter %u",
+                                             physical_chapter);
+       }
+
+       for (i = 0; i < geometry->record_pages_per_chapter; i++) {
+               u8 *record_page;
+               u32 record_page_number;
+
+               record_page_number = geometry->index_pages_per_chapter + i;
+               result = uds_get_volume_record_page(index->volume, physical_chapter,
+                                                   record_page_number, &record_page);
+               if (result != UDS_SUCCESS) {
+                       return uds_log_error_strerror(result, "could not get page %d",
+                                                     record_page_number);
+               }
+
+               for (j = 0; j < geometry->records_per_page; j++) {
+                       const u8 *name_bytes;
+                       struct uds_record_name name;
+
+                       name_bytes = record_page + (j * BYTES_PER_RECORD);
+                       memcpy(&name.name, name_bytes, UDS_RECORD_NAME_SIZE);
+                       result = replay_record(index, &name, virtual, sparse);
+                       if (result != UDS_SUCCESS)
+                               return result;
+               }
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int replay_volume(struct uds_index *index)
+{
+       int result;
+       u64 old_map_update;
+       u64 new_map_update;
+       u64 virtual;
+       u64 from_virtual = index->oldest_virtual_chapter;
+       u64 upto_virtual = index->newest_virtual_chapter;
+       bool will_be_sparse;
+
+       uds_log_info("Replaying volume from chapter %llu through chapter %llu",
+                    (unsigned long long) from_virtual,
+                    (unsigned long long) upto_virtual);
+
+       /*
+        * The index failed to load, so the volume index is empty. Add records to the volume index
+        * in order, skipping non-hooks in chapters which will be sparse to save time.
+        *
+        * Go through each record page of each chapter and add the records back to the volume
+        * index. This should not cause anything to be written to either the open chapter or the
+        * on-disk volume. Also skip the on-disk chapter corresponding to upto_virtual, as this
+        * would have already been purged from the volume index when the chapter was opened.
+        *
+        * Also, go through each index page for each chapter and rebuild the index page map.
+        */
+       old_map_update = index->volume->index_page_map->last_update;
+       for (virtual = from_virtual; virtual < upto_virtual; virtual++) {
+               will_be_sparse = uds_is_chapter_sparse(index->volume->geometry,
+                                                      from_virtual, upto_virtual,
+                                                      virtual);
+               result = replay_chapter(index, virtual, will_be_sparse);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       /* Also reap the chapter being replaced by the open chapter. */
+       uds_set_volume_index_open_chapter(index->volume_index, upto_virtual);
+
+       new_map_update = index->volume->index_page_map->last_update;
+       if (new_map_update != old_map_update) {
+               uds_log_info("replay changed index page map update from %llu to %llu",
+                            (unsigned long long) old_map_update,
+                            (unsigned long long) new_map_update);
+       }
+
+       return UDS_SUCCESS;
+}
+
+static int rebuild_index(struct uds_index *index)
+{
+       int result;
+       u64 lowest;
+       u64 highest;
+       bool is_empty = false;
+       u32 chapters_per_volume = index->volume->geometry->chapters_per_volume;
+
+       index->volume->lookup_mode = LOOKUP_FOR_REBUILD;
+       result = uds_find_volume_chapter_boundaries(index->volume, &lowest, &highest,
+                                                   &is_empty);
+       if (result != UDS_SUCCESS) {
+               return uds_log_fatal_strerror(result,
+                                             "cannot rebuild index: unknown volume chapter boundaries");
+       }
+
+       if (is_empty) {
+               index->newest_virtual_chapter = 0;
+               index->oldest_virtual_chapter = 0;
+               index->volume->lookup_mode = LOOKUP_NORMAL;
+               return UDS_SUCCESS;
+       }
+
+       index->newest_virtual_chapter = highest + 1;
+       index->oldest_virtual_chapter = lowest;
+       if (index->newest_virtual_chapter ==
+           (index->oldest_virtual_chapter + chapters_per_volume)) {
+               /* Skip the chapter shadowed by the open chapter. */
+               index->oldest_virtual_chapter++;
+       }
+
+       result = replay_volume(index);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       index->volume->lookup_mode = LOOKUP_NORMAL;
+       return UDS_SUCCESS;
+}
+
+static void free_index_zone(struct index_zone *zone)
+{
+       if (zone == NULL)
+               return;
+
+       uds_free_open_chapter(zone->open_chapter);
+       uds_free_open_chapter(zone->writing_chapter);
+       uds_free(zone);
+}
+
+static int make_index_zone(struct uds_index *index, unsigned int zone_number)
+{
+       int result;
+       struct index_zone *zone;
+
+       result = uds_allocate(1, struct index_zone, "index zone", &zone);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_make_open_chapter(index->volume->geometry, index->zone_count,
+                                      &zone->open_chapter);
+       if (result != UDS_SUCCESS) {
+               free_index_zone(zone);
+               return result;
+       }
+
+       result = uds_make_open_chapter(index->volume->geometry, index->zone_count,
+                                      &zone->writing_chapter);
+       if (result != UDS_SUCCESS) {
+               free_index_zone(zone);
+               return result;
+       }
+
+       zone->index = index;
+       zone->id = zone_number;
+       index->zones[zone_number] = zone;
+
+       return UDS_SUCCESS;
+}
+
+int uds_make_index(struct uds_configuration *config, enum uds_open_index_type open_type,
+                  struct index_load_context *load_context, index_callback_fn callback,
+                  struct uds_index **new_index)
+{
+       int result;
+       bool loaded = false;
+       bool new = (open_type == UDS_CREATE);
+       struct uds_index *index = NULL;
+       struct index_zone *zone;
+       u64 nonce;
+       unsigned int z;
+
+       result = uds_allocate_extended(struct uds_index, config->zone_count,
+                                      struct uds_request_queue *, "index", &index);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       index->zone_count = config->zone_count;
+
+       result = uds_make_index_layout(config, new, &index->layout);
+       if (result != UDS_SUCCESS) {
+               uds_free_index(index);
+               return result;
+       }
+
+       result = uds_allocate(index->zone_count, struct index_zone *, "zones",
+                             &index->zones);
+       if (result != UDS_SUCCESS) {
+               uds_free_index(index);
+               return result;
+       }
+
+       result = uds_make_volume(config, index->layout, &index->volume);
+       if (result != UDS_SUCCESS) {
+               uds_free_index(index);
+               return result;
+       }
+
+       index->volume->lookup_mode = LOOKUP_NORMAL;
+       for (z = 0; z < index->zone_count; z++) {
+               result = make_index_zone(index, z);
+               if (result != UDS_SUCCESS) {
+                       uds_free_index(index);
+                       return uds_log_error_strerror(result,
+                                                     "Could not create index zone");
+               }
+       }
+
+       nonce = uds_get_volume_nonce(index->layout);
+       result = uds_make_volume_index(config, nonce, &index->volume_index);
+       if (result != UDS_SUCCESS) {
+               uds_free_index(index);
+               return uds_log_error_strerror(result, "could not make volume index");
+       }
+
+       index->load_context = load_context;
+       index->callback = callback;
+
+       result = initialize_index_queues(index, config->geometry);
+       if (result != UDS_SUCCESS) {
+               uds_free_index(index);
+               return result;
+       }
+
+       result = make_chapter_writer(index, &index->chapter_writer);
+       if (result != UDS_SUCCESS) {
+               uds_free_index(index);
+               return result;
+       }
+
+       if (!new) {
+               result = load_index(index);
+               switch (result) {
+               case UDS_SUCCESS:
+                       loaded = true;
+                       break;
+               case -ENOMEM:
+                       /* We should not try a rebuild for this error. */
+                       uds_log_error_strerror(result, "index could not be loaded");
+                       break;
+               default:
+                       uds_log_error_strerror(result, "index could not be loaded");
+                       if (open_type == UDS_LOAD) {
+                               result = rebuild_index(index);
+                               if (result != UDS_SUCCESS) {
+                                       uds_log_error_strerror(result,
+                                                              "index could not be rebuilt");
+                               }
+                       }
+                       break;
+               }
+       }
+
+       if (result != UDS_SUCCESS) {
+               uds_free_index(index);
+               return uds_log_error_strerror(result, "fatal error in %s()", __func__);
+       }
+
+       for (z = 0; z < index->zone_count; z++) {
+               zone = index->zones[z];
+               zone->oldest_virtual_chapter = index->oldest_virtual_chapter;
+               zone->newest_virtual_chapter = index->newest_virtual_chapter;
+       }
+
+       if (index->load_context != NULL) {
+               mutex_lock(&index->load_context->mutex);
+               index->load_context->status = INDEX_READY;
+               /*
+                * If we get here, suspend is meaningless, but notify any thread trying to suspend
+                * us so it doesn't hang.
+                */
+               uds_broadcast_cond(&index->load_context->cond);
+               mutex_unlock(&index->load_context->mutex);
+       }
+
+       index->has_saved_open_chapter = loaded;
+       index->need_to_save = !loaded;
+       *new_index = index;
+       return UDS_SUCCESS;
+}
+
+void uds_free_index(struct uds_index *index)
+{
+       unsigned int i;
+
+       if (index == NULL)
+               return;
+
+       uds_request_queue_finish(index->triage_queue);
+       for (i = 0; i < index->zone_count; i++)
+               uds_request_queue_finish(index->zone_queues[i]);
+
+       free_chapter_writer(index->chapter_writer);
+
+       uds_free_volume_index(index->volume_index);
+       if (index->zones != NULL) {
+               for (i = 0; i < index->zone_count; i++)
+                       free_index_zone(index->zones[i]);
+               uds_free(index->zones);
+       }
+
+       uds_free_volume(index->volume);
+       uds_free_index_layout(uds_forget(index->layout));
+       uds_free(index);
+}
+
+/* Wait for the chapter writer to complete any outstanding writes. */
+void uds_wait_for_idle_index(struct uds_index *index)
+{
+       struct chapter_writer *writer = index->chapter_writer;
+
+       mutex_lock(&writer->mutex);
+       while (writer->zones_to_write > 0)
+               uds_wait_cond(&writer->cond, &writer->mutex);
+       mutex_unlock(&writer->mutex);
+}
+
+/* This function assumes that all requests have been drained. */
+int uds_save_index(struct uds_index *index)
+{
+       int result;
+
+       if (!index->need_to_save)
+               return UDS_SUCCESS;
+
+       uds_wait_for_idle_index(index);
+       index->prev_save = index->last_save;
+       index->last_save = ((index->newest_virtual_chapter == 0) ?
+                           NO_LAST_SAVE : index->newest_virtual_chapter - 1);
+       uds_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save);
+
+       result = uds_save_index_state(index->layout, index);
+       if (result != UDS_SUCCESS) {
+               uds_log_info("save index failed");
+               index->last_save = index->prev_save;
+       } else {
+               index->has_saved_open_chapter = true;
+               index->need_to_save = false;
+               uds_log_info("finished save (vcn %llu)",
+                            (unsigned long long) index->last_save);
+       }
+
+       return result;
+}
+
+int uds_replace_index_storage(struct uds_index *index, struct block_device *bdev)
+{
+       return uds_replace_volume_storage(index->volume, index->layout, bdev);
+}
+
+/* Accessing statistics should be safe from any thread. */
+void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters)
+{
+       struct volume_index_stats stats;
+
+       uds_get_volume_index_stats(index->volume_index, &stats);
+       counters->entries_indexed = stats.record_count;
+       counters->collisions = stats.collision_count;
+       counters->entries_discarded = stats.discard_count;
+
+       counters->memory_used = (index->volume_index->memory_size +
+                                index->volume->cache_size +
+                                index->chapter_writer->memory_size);
+}
+
+void uds_enqueue_request(struct uds_request *request, enum request_stage stage)
+{
+       struct uds_index *index = request->index;
+       struct uds_request_queue *queue;
+
+       switch (stage) {
+       case STAGE_TRIAGE:
+               if (index->triage_queue != NULL) {
+                       queue = index->triage_queue;
+                       break;
+               }
+
+               fallthrough;
+
+       case STAGE_INDEX:
+               request->zone_number =
+                       uds_get_volume_index_zone(index->volume_index, &request->record_name);
+               fallthrough;
+
+       case STAGE_MESSAGE:
+               queue = index->zone_queues[request->zone_number];
+               break;
+
+       default:
+               ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage);
+               return;
+       }
+
+       uds_request_queue_enqueue(queue, request);
+}
diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h
new file mode 100644
index 0000000..edabb23
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/index.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_INDEX_H
+#define UDS_INDEX_H
+
+#include "index-layout.h"
+#include "index-session.h"
+#include "open-chapter.h"
+#include "volume.h"
+#include "volume-index.h"
+
+/*
+ * The index is a high-level structure which represents the totality of the UDS index. It manages
+ * the queues for incoming requests and dispatches them to the appropriate sub-components like the
+ * volume or the volume index. It also manages administrative tasks such as saving and loading the
+ * index.
+ *
+ * The index is divided into a number of independent zones, and each request is assigned to a zone
+ * based on its name. Most sub-components are similarly divided into zones so that requests in each
+ * zone can usually be processed without interference or coordination between zones.
+ */
+
+typedef void (*index_callback_fn)(struct uds_request *request);
+
+struct index_zone {
+       struct uds_index *index;
+       struct open_chapter_zone *open_chapter;
+       struct open_chapter_zone *writing_chapter;
+       u64 oldest_virtual_chapter;
+       u64 newest_virtual_chapter;
+       unsigned int id;
+};
+
+struct uds_index {
+       bool has_saved_open_chapter;
+       bool need_to_save;
+       struct index_load_context *load_context;
+       struct index_layout *layout;
+       struct volume_index *volume_index;
+       struct volume *volume;
+       unsigned int zone_count;
+       struct index_zone **zones;
+
+       u64 oldest_virtual_chapter;
+       u64 newest_virtual_chapter;
+
+       u64 last_save;
+       u64 prev_save;
+       struct chapter_writer *chapter_writer;
+
+       index_callback_fn callback;
+       struct uds_request_queue *triage_queue;
+       struct uds_request_queue *zone_queues[];
+};
+
+enum request_stage {
+       STAGE_TRIAGE,
+       STAGE_INDEX,
+       STAGE_MESSAGE,
+};
+
+int __must_check uds_make_index(struct uds_configuration *config,
+                               enum uds_open_index_type open_type,
+                               struct index_load_context *load_context,
+                               index_callback_fn callback, struct uds_index **new_index);
+
+int __must_check uds_save_index(struct uds_index *index);
+
+void uds_free_index(struct uds_index *index);
+
+int __must_check uds_replace_index_storage(struct uds_index *index,
+                                          struct block_device *bdev);
+
+void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters);
+
+void uds_enqueue_request(struct uds_request *request, enum request_stage stage);
+
+void uds_wait_for_idle_index(struct uds_index *index);
+
+#endif /* UDS_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h
new file mode 100644
index 0000000..3744aaf
--- /dev/null
+++ b/drivers/md/dm-vdo/indexer/indexer.h
@@ -0,0 +1,353 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef INDEXER_H
+#define INDEXER_H
+
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include "funnel-queue.h"
+
+/*
+ * UDS public API
+ *
+ * The Universal Deduplication System (UDS) is an efficient name-value store. When used for
+ * deduplicating storage, the names are generally hashes of data blocks and the associated data is
+ * where that block is located on the underlying storage medium. The stored names are expected to
+ * be randomly distributed among the space of possible names. If this assumption is violated, the
+ * UDS index will store fewer names than normal but will otherwise continue to work. The data
+ * associated with each name can be any 16-byte value.
+ *
+ * A client must first create an index session to interact with an index. Once created, the session
+ * can be shared among multiple threads or users. When a session is destroyed, it will also close
+ * and save any associated index.
+ *
+ * To make a request, a client must allocate a uds_request structure and set the required fields
+ * before launching it. UDS will invoke the provided callback to complete the request. After the
+ * callback has been called, the uds_request structure can be freed or reused for a new request.
+ * There are five types of requests:
+ *
+ * A UDS_UPDATE request will associate the provided name with the provided data. Any previous data
+ * associated with that name will be discarded.
+ *
+ * A UDS_QUERY request will return the data associated with the provided name, if any. The entry
+ * for the name will also be marked as most recent, as if the data had been updated.
+ *
+ * A UDS_POST request is a combination of UDS_QUERY and UDS_UPDATE. If there is already data
+ * associated with the provided name, that data is returned. If there is no existing association,
+ * the name is associated with the newly provided data. This request is equivalent to a UDS_QUERY
+ * request followed by a UDS_UPDATE request if no data is found, but it is much more efficient.
+ *
+ * A UDS_QUERY_NO_UPDATE request will return the data associated with the provided name, but will
+ * not change the recency of the entry for the name. This request is primarily useful for testing,
+ * to determine whether an entry exists without changing the internal state of the index.
+ *
+ * A UDS_DELETE request removes any data associated with the provided name. This operation is
+ * generally not necessary, because the index will automatically discard its oldest entries once it
+ * becomes full.
+ */
+
+/* General UDS constants and structures */
+
+enum uds_request_type {
+       /* Create or update the mapping for a name, and make the name most recent. */
+       UDS_UPDATE,
+
+       /* Return any mapped data for a name, and make the name most recent. */
+       UDS_QUERY,
+
+       /*
+        * Return any mapped data for a name, or map the provided data to the name if there is no
+        * current data, and make the name most recent.
+        */
+       UDS_POST,
+
+       /* Return any mapped data for a name without updating its recency. */
+       UDS_QUERY_NO_UPDATE,
+
+       /* Remove any mapping for a name. */
+       UDS_DELETE,
+
+};
+
+enum uds_open_index_type {
+       /* Create a new index. */
+       UDS_CREATE,
+
+       /* Load an existing index and try to recover if necessary. */
+       UDS_LOAD,
+
+       /* Load an existing index, but only if it was saved cleanly. */
+       UDS_NO_REBUILD,
+};
+
+enum {
+       /* The record name size in bytes */
+       UDS_RECORD_NAME_SIZE = 16,
+       /* The maximum record data size in bytes */
+       UDS_RECORD_DATA_SIZE = 16,
+};
+
+/*
+ * A type representing a UDS memory configuration which is either a positive integer number of
+ * gigabytes or one of the six special constants for configurations smaller than one gigabyte.
+ */
+typedef int uds_memory_config_size_t;
+
+enum {
+       /* The maximum configurable amount of memory */
+       UDS_MEMORY_CONFIG_MAX = 1024,
+       /* Flag indicating that the index has one less chapter than usual */
+       UDS_MEMORY_CONFIG_REDUCED = 0x1000,
+       UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 + UDS_MEMORY_CONFIG_REDUCED,
+       /* Special values indicating sizes less than 1 GB */
+       UDS_MEMORY_CONFIG_256MB = -256,
+       UDS_MEMORY_CONFIG_512MB = -512,
+       UDS_MEMORY_CONFIG_768MB = -768,
+       UDS_MEMORY_CONFIG_REDUCED_256MB = -1280,
+       UDS_MEMORY_CONFIG_REDUCED_512MB = -1536,
+       UDS_MEMORY_CONFIG_REDUCED_768MB = -1792,
+};
+
+struct uds_record_name {
+       unsigned char name[UDS_RECORD_NAME_SIZE];
+};
+
+struct uds_record_data {
+       unsigned char data[UDS_RECORD_DATA_SIZE];
+};
+
+struct uds_volume_record {
+       struct uds_record_name name;
+       struct uds_record_data data;
+};
+
+struct uds_parameters {
+       /* The block_device used for storage */
+       struct block_device *bdev;
+       /* The maximum allowable size of the index on storage */
+       size_t size;
+       /* The offset where the index should start */
+       off_t offset;
+       /* The maximum memory allocation, in GB */
+       uds_memory_config_size_t memory_size;
+       /* Whether the index should include sparse chapters */
+       bool sparse;
+       /* A 64-bit nonce to validate the index */
+       u64 nonce;
+       /* The number of threads used to process index requests */
+       unsigned int zone_count;
+       /* The number of threads used to read volume pages */
+       unsigned int read_threads;
+};
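+
+/*
+ * A minimal sketch (not part of the API) of how the two encodings of memory_size are written:
+ * values of 1 or more mean whole gigabytes, while sub-gigabyte sizes use the special constants
+ * above. The example_parameters() helper is hypothetical and exists only for illustration.
+ */
+static inline struct uds_parameters example_parameters(struct block_device *bdev, bool small)
+{
+	struct uds_parameters parameters = {
+		.bdev = bdev,
+		/* Either a whole number of gigabytes or a special sub-gigabyte constant. */
+		.memory_size = small ? UDS_MEMORY_CONFIG_256MB : 1,
+	};
+
+	return parameters;
+}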
+
+/*
+ * These statistics capture characteristics of the current index, including resource usage and
+ * requests processed since the index was opened.
+ */
+struct uds_index_stats {
+       /* The total number of records stored in the index */
+       u64 entries_indexed;
+       /* An estimate of the index's memory usage, in bytes */
+       u64 memory_used;
+       /* The number of collisions recorded in the volume index */
+       u64 collisions;
+       /* The number of entries discarded from the index since startup */
+       u64 entries_discarded;
+       /* The time at which these statistics were fetched */
+       s64 current_time;
+       /* The number of post calls that found an existing entry */
+       u64 posts_found;
+       /* The number of post calls that added an entry */
+       u64 posts_not_found;
+       /*
+        * The number of post calls that found an existing entry that is current enough to only
+        * exist in memory and not have been committed to disk yet
+        */
+       u64 in_memory_posts_found;
+       /*
+        * The number of post calls that found an existing entry in the dense portion of the index
+        */
+       u64 dense_posts_found;
+       /*
+        * The number of post calls that found an existing entry in the sparse portion of the index
+        */
+       u64 sparse_posts_found;
+       /* The number of update calls that updated an existing entry */
+       u64 updates_found;
+       /* The number of update calls that added a new entry */
+       u64 updates_not_found;
+       /* The number of delete requests that deleted an existing entry */
+       u64 deletions_found;
+       /* The number of delete requests that did nothing */
+       u64 deletions_not_found;
+	/* The number of query calls that found an existing entry */
+       u64 queries_found;
+       /* The number of query calls that did not find an entry */
+       u64 queries_not_found;
+       /* The total number of requests processed */
+       u64 requests;
+};
+
+enum uds_index_region {
+       /* No location information has been determined */
+       UDS_LOCATION_UNKNOWN = 0,
+       /* The index page entry has been found */
+       UDS_LOCATION_INDEX_PAGE_LOOKUP,
+       /* The record page entry has been found */
+       UDS_LOCATION_RECORD_PAGE_LOOKUP,
+       /* The record is not in the index */
+       UDS_LOCATION_UNAVAILABLE,
+       /* The record was found in the open chapter */
+       UDS_LOCATION_IN_OPEN_CHAPTER,
+       /* The record was found in the dense part of the index */
+       UDS_LOCATION_IN_DENSE,
+       /* The record was found in the sparse part of the index */
+       UDS_LOCATION_IN_SPARSE,
+} __packed;
+
+/* Zone message requests are used to communicate between index zones. */
+enum uds_zone_message_type {
+       /* A standard request with no message */
+       UDS_MESSAGE_NONE = 0,
+       /* Add a chapter to the sparse chapter index cache */
+       UDS_MESSAGE_SPARSE_CACHE_BARRIER,
+       /* Close a chapter to keep the zone from falling behind */
+       UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED,
+} __packed;
+
+struct uds_zone_message {
+       /* The type of message, determining how it will be processed */
+       enum uds_zone_message_type type;
+       /* The virtual chapter number to which the message applies */
+       u64 virtual_chapter;
+};
+
+struct uds_index_session;
+struct uds_index;
+struct uds_request;
+
+/* Once this callback has been invoked, the uds_request structure can be reused or freed. */
+typedef void (*uds_request_callback_fn)(struct uds_request *request);
+
+struct uds_request {
+       /* These input fields must be set before launching a request. */
+
+       /* The name of the record to look up or create */
+       struct uds_record_name record_name;
+       /* New data to associate with the record name, if applicable */
+       struct uds_record_data new_metadata;
+       /* A callback to invoke when the request is complete */
+       uds_request_callback_fn callback;
+       /* The index session that will manage this request */
+       struct uds_index_session *session;
+	/* The type of operation to perform, as described above */
+       enum uds_request_type type;
+
+       /* These output fields are set when a request is complete. */
+
+       /* The existing data associated with the request name, if any */
+       struct uds_record_data old_metadata;
+       /* Either UDS_SUCCESS or an error code for the request */
+       int status;
+       /* True if the record name had an existing entry in the index */
+       bool found;
+
+       /*
+        * The remaining fields are used internally and should not be altered by clients. The index
+        * relies on zone_number being the first field in this section.
+        */
+
+	/* The number of the zone which will process this request */
+       unsigned int zone_number;
+       /* A link for adding a request to a lock-free queue */
+       struct funnel_queue_entry queue_link;
+       /* A link for adding a request to a standard linked list */
+       struct uds_request *next_request;
+       /* A pointer to the index processing this request */
+       struct uds_index *index;
+       /* Control message for coordinating between zones */
+       struct uds_zone_message zone_message;
+       /* If true, process request immediately by waking the worker thread */
+       bool unbatched;
+       /* If true, continue this request before processing newer requests */
+       bool requeued;
+       /* The virtual chapter containing the record name, if known */
+       u64 virtual_chapter;
+       /* The region of the index containing the record name */
+       enum uds_index_region location;
+};
+
+/* Compute the number of bytes needed to store an index. */
+int __must_check uds_compute_index_size(const struct uds_parameters *parameters,
+                                       u64 *index_size);
+
+/* A session is required for most index operations. */
+int __must_check uds_create_index_session(struct uds_index_session **session);
+
+/* Destroying an index session also closes and saves the associated index. */
+int uds_destroy_index_session(struct uds_index_session *session);
+
+/*
+ * Create or open an index with an existing session. This operation fails if the index session is
+ * suspended, or if there is already an open index.
+ */
+int __must_check uds_open_index(enum uds_open_index_type open_type,
+                               const struct uds_parameters *parameters,
+                               struct uds_index_session *session);
+
+/*
+ * Wait until all callbacks for index operations are complete, and prevent new index operations
+ * from starting. New index operations will fail with EBUSY until the session is resumed. If
+ * requested, the index is also saved before the suspend completes.
+ */
+int __must_check uds_suspend_index_session(struct uds_index_session *session, bool save);
+
+/*
+ * Allow new index operations for an index, whether it was suspended or not. If the index is
+ * suspended and the supplied block device differs from the current backing store, the index will
+ * start using the new backing store instead.
+ */
+int __must_check uds_resume_index_session(struct uds_index_session *session,
+                                         struct block_device *bdev);
+
+/* Wait until all outstanding index operations are complete. */
+int __must_check uds_flush_index_session(struct uds_index_session *session);
+
+/* Close an index. This operation fails if the index session is suspended. */
+int __must_check uds_close_index(struct uds_index_session *session);
+
+/* Get index statistics since the last time the index was opened. */
+int __must_check uds_get_index_session_stats(struct uds_index_session *session,
+                                            struct uds_index_stats *stats);
+
+/* This function will fail if any required field of the request is not set. */
+int __must_check uds_launch_request(struct uds_request *request);
+
+struct cond_var {
+       wait_queue_head_t wait_queue;
+};
+
+static inline void uds_init_cond(struct cond_var *cv)
+{
+       init_waitqueue_head(&cv->wait_queue);
+}
+
+static inline void uds_signal_cond(struct cond_var *cv)
+{
+       wake_up(&cv->wait_queue);
+}
+
+static inline void uds_broadcast_cond(struct cond_var *cv)
+{
+       wake_up_all(&cv->wait_queue);
+}
+
+void uds_wait_cond(struct cond_var *cv, struct mutex *mutex);
+
+#endif /* INDEXER_H */
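
For orientation, the declarations above imply a simple client lifecycle: create a session, open an index on a block device, launch requests, and tear everything down. The sketch below is editorial and not part of the patch; it assumes the uds_parameters and uds_request fields declared earlier in this header (bdev, record_name, new_metadata) and the UDS_CREATE and UDS_POST constants, and it abbreviates error handling.

#include "indexer.h"

static void example_callback(struct uds_request *request)
{
        /* On completion, request->status, request->found, and request->old_metadata are valid. */
}

static int example_index_usage(struct block_device *bdev,
                               const struct uds_record_name *name,
                               const struct uds_record_data *data)
{
        struct uds_index_session *session;
        struct uds_parameters parameters = { .bdev = bdev };
        struct uds_request request = {
                .record_name = *name,
                .new_metadata = *data,
                .type = UDS_POST,
                .callback = example_callback,
        };
        int result;

        result = uds_create_index_session(&session);
        if (result != UDS_SUCCESS)
                return result;

        result = uds_open_index(UDS_CREATE, &parameters, session);
        if (result != UDS_SUCCESS)
                goto out;

        request.session = session;
        result = uds_launch_request(&request);
        if (result == UDS_SUCCESS)
                result = uds_flush_index_session(session);

        if (result == UDS_SUCCESS)
                result = uds_close_index(session);
out:
        /* Destroying the session also closes and saves any index still open. */
        uds_destroy_index_session(session);
        return result;
}
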
diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c
new file mode 100644 (file)
index 0000000..02242df
--- /dev/null
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "io-factory.h"
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/err.h>
+#include <linux/mount.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+
+/*
+ * The I/O factory object manages access to index storage, which is a contiguous range of blocks on
+ * a block device.
+ *
+ * The factory holds the open device and is responsible for closing it. The factory has methods to
+ * make helper structures that can be used to access sections of the index.
+ */
+struct io_factory {
+       struct block_device *bdev;
+       atomic_t ref_count;
+};
+
+/* The buffered reader allows efficient I/O by reading page-sized segments into a buffer. */
+struct buffered_reader {
+       struct io_factory *factory;
+       struct dm_bufio_client *client;
+       struct dm_buffer *buffer;
+       sector_t limit;
+       sector_t block_number;
+       u8 *start;
+       u8 *end;
+};
+
+enum { MAX_READ_AHEAD_BLOCKS = 4 };
+
+/*
+ * The buffered writer allows efficient I/O by buffering writes and committing page-sized segments
+ * to storage.
+ */
+struct buffered_writer {
+       struct io_factory *factory;
+       struct dm_bufio_client *client;
+       struct dm_buffer *buffer;
+       sector_t limit;
+       sector_t block_number;
+       u8 *start;
+       u8 *end;
+       int error;
+};
+
+static void uds_get_io_factory(struct io_factory *factory)
+{
+       atomic_inc(&factory->ref_count);
+}
+
+int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_ptr)
+{
+       int result;
+       struct io_factory *factory;
+
+       result = uds_allocate(1, struct io_factory, __func__, &factory);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       factory->bdev = bdev;
+       atomic_set_release(&factory->ref_count, 1);
+
+       *factory_ptr = factory;
+       return UDS_SUCCESS;
+}
+
+int uds_replace_storage(struct io_factory *factory, struct block_device *bdev)
+{
+       factory->bdev = bdev;
+       return UDS_SUCCESS;
+}
+
+/* Free an I/O factory once all references have been released. */
+void uds_put_io_factory(struct io_factory *factory)
+{
+       if (atomic_add_return(-1, &factory->ref_count) <= 0)
+               uds_free(factory);
+}
+
+size_t uds_get_writable_size(struct io_factory *factory)
+{
+       return i_size_read(factory->bdev->bd_inode);
+}
+
+/* Create a struct dm_bufio_client for an index region starting at offset. */
+int uds_make_bufio(struct io_factory *factory, off_t block_offset, size_t block_size,
+                  unsigned int reserved_buffers, struct dm_bufio_client **client_ptr)
+{
+       struct dm_bufio_client *client;
+
+       client = dm_bufio_client_create(factory->bdev, block_size, reserved_buffers, 0,
+                                       NULL, NULL, 0);
+       if (IS_ERR(client))
+               return -PTR_ERR(client);
+
+       dm_bufio_set_sector_offset(client, block_offset * SECTORS_PER_BLOCK);
+       *client_ptr = client;
+       return UDS_SUCCESS;
+}
+
+static void read_ahead(struct buffered_reader *reader, sector_t block_number)
+{
+       if (block_number < reader->limit) {
+               sector_t read_ahead = min((sector_t) MAX_READ_AHEAD_BLOCKS,
+                                         reader->limit - block_number);
+
+               dm_bufio_prefetch(reader->client, block_number, read_ahead);
+       }
+}
+
+void uds_free_buffered_reader(struct buffered_reader *reader)
+{
+       if (reader == NULL)
+               return;
+
+       if (reader->buffer != NULL)
+               dm_bufio_release(reader->buffer);
+
+       dm_bufio_client_destroy(reader->client);
+       uds_put_io_factory(reader->factory);
+       uds_free(reader);
+}
+
+/* Create a buffered reader for an index region starting at offset. */
+int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block_count,
+                            struct buffered_reader **reader_ptr)
+{
+       int result;
+       struct dm_bufio_client *client = NULL;
+       struct buffered_reader *reader = NULL;
+
+       result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(1, struct buffered_reader, "buffered reader", &reader);
+       if (result != UDS_SUCCESS) {
+               dm_bufio_client_destroy(client);
+               return result;
+       }
+
+       *reader = (struct buffered_reader) {
+               .factory = factory,
+               .client = client,
+               .buffer = NULL,
+               .limit = block_count,
+               .block_number = 0,
+               .start = NULL,
+               .end = NULL,
+       };
+
+       read_ahead(reader, 0);
+       uds_get_io_factory(factory);
+       *reader_ptr = reader;
+       return UDS_SUCCESS;
+}
+
+static int position_reader(struct buffered_reader *reader, sector_t block_number,
+                          off_t offset)
+{
+       struct dm_buffer *buffer = NULL;
+       void *data;
+
+       if ((reader->end == NULL) || (block_number != reader->block_number)) {
+               if (block_number >= reader->limit)
+                       return UDS_OUT_OF_RANGE;
+
+               if (reader->buffer != NULL)
+                       dm_bufio_release(uds_forget(reader->buffer));
+
+               data = dm_bufio_read(reader->client, block_number, &buffer);
+               if (IS_ERR(data))
+                       return -PTR_ERR(data);
+
+               reader->buffer = buffer;
+               reader->start = data;
+               if (block_number == reader->block_number + 1)
+                       read_ahead(reader, block_number + 1);
+       }
+
+       reader->block_number = block_number;
+       reader->end = reader->start + offset;
+       return UDS_SUCCESS;
+}
+
+static size_t bytes_remaining_in_read_buffer(struct buffered_reader *reader)
+{
+       return (reader->end == NULL) ? 0 : reader->start + UDS_BLOCK_SIZE - reader->end;
+}
+
+static int reset_reader(struct buffered_reader *reader)
+{
+       sector_t block_number;
+
+       if (bytes_remaining_in_read_buffer(reader) > 0)
+               return UDS_SUCCESS;
+
+       block_number = reader->block_number;
+       if (reader->end != NULL)
+               block_number++;
+
+       return position_reader(reader, block_number, 0);
+}
+
+int uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data,
+                                 size_t length)
+{
+       int result = UDS_SUCCESS;
+       size_t chunk_size;
+
+       while (length > 0) {
+               result = reset_reader(reader);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               chunk_size = min(length, bytes_remaining_in_read_buffer(reader));
+               memcpy(data, reader->end, chunk_size);
+               length -= chunk_size;
+               data += chunk_size;
+               reader->end += chunk_size;
+       }
+
+       return UDS_SUCCESS;
+}
+
+/*
+ * Verify that the next data on the reader matches the required value. If the value matches, the
+ * matching contents are consumed. If the value does not match, the reader state is unchanged.
+ */
+int uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value,
+                            size_t length)
+{
+       int result = UDS_SUCCESS;
+       size_t chunk_size;
+       sector_t start_block_number = reader->block_number;
+       int start_offset = reader->end - reader->start;
+
+       while (length > 0) {
+               result = reset_reader(reader);
+               if (result != UDS_SUCCESS) {
+                       result = UDS_CORRUPT_DATA;
+                       break;
+               }
+
+               chunk_size = min(length, bytes_remaining_in_read_buffer(reader));
+               if (memcmp(value, reader->end, chunk_size) != 0) {
+                       result = UDS_CORRUPT_DATA;
+                       break;
+               }
+
+               length -= chunk_size;
+               value += chunk_size;
+               reader->end += chunk_size;
+       }
+
+       if (result != UDS_SUCCESS)
+               position_reader(reader, start_block_number, start_offset);
+
+       return result;
+}
+
+/* Create a buffered writer for an index region starting at offset. */
+int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block_count,
+                            struct buffered_writer **writer_ptr)
+{
+       int result;
+       struct dm_bufio_client *client = NULL;
+       struct buffered_writer *writer;
+
+       result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(1, struct buffered_writer, "buffered writer", &writer);
+       if (result != UDS_SUCCESS) {
+               dm_bufio_client_destroy(client);
+               return result;
+       }
+
+       *writer = (struct buffered_writer) {
+               .factory = factory,
+               .client = client,
+               .buffer = NULL,
+               .limit = block_count,
+               .start = NULL,
+               .end = NULL,
+               .block_number = 0,
+               .error = UDS_SUCCESS,
+       };
+
+       uds_get_io_factory(factory);
+       *writer_ptr = writer;
+       return UDS_SUCCESS;
+}
+
+static size_t get_remaining_write_space(struct buffered_writer *writer)
+{
+       return writer->start + UDS_BLOCK_SIZE - writer->end;
+}
+
+static int __must_check prepare_next_buffer(struct buffered_writer *writer)
+{
+       struct dm_buffer *buffer = NULL;
+       void *data;
+
+       if (writer->block_number >= writer->limit) {
+               writer->error = UDS_OUT_OF_RANGE;
+               return UDS_OUT_OF_RANGE;
+       }
+
+       data = dm_bufio_new(writer->client, writer->block_number, &buffer);
+       if (IS_ERR(data)) {
+               writer->error = -PTR_ERR(data);
+               return writer->error;
+       }
+
+       writer->buffer = buffer;
+       writer->start = data;
+       writer->end = data;
+       return UDS_SUCCESS;
+}
+
+static int flush_previous_buffer(struct buffered_writer *writer)
+{
+       size_t available;
+
+       if (writer->buffer == NULL)
+               return writer->error;
+
+       if (writer->error == UDS_SUCCESS) {
+               available = get_remaining_write_space(writer);
+
+               if (available > 0)
+                       memset(writer->end, 0, available);
+
+               dm_bufio_mark_buffer_dirty(writer->buffer);
+       }
+
+       dm_bufio_release(writer->buffer);
+       writer->buffer = NULL;
+       writer->start = NULL;
+       writer->end = NULL;
+       writer->block_number++;
+       return writer->error;
+}
+
+void uds_free_buffered_writer(struct buffered_writer *writer)
+{
+       int result;
+
+       if (writer == NULL)
+               return;
+
+       flush_previous_buffer(writer);
+       result = -dm_bufio_write_dirty_buffers(writer->client);
+       if (result != UDS_SUCCESS)
+               uds_log_warning_strerror(result, "%s: failed to sync storage", __func__);
+
+       dm_bufio_client_destroy(writer->client);
+       uds_put_io_factory(writer->factory);
+       uds_free(writer);
+}
+
+/*
+ * Append data to the buffer, writing as needed. If no data is provided, zeros are written instead.
+ * If a write error occurs, it is recorded and returned on every subsequent write attempt.
+ */
+int uds_write_to_buffered_writer(struct buffered_writer *writer, const u8 *data,
+                                size_t length)
+{
+       int result = writer->error;
+       size_t chunk_size;
+
+       while ((length > 0) && (result == UDS_SUCCESS)) {
+               if (writer->buffer == NULL) {
+                       result = prepare_next_buffer(writer);
+                       continue;
+               }
+
+               chunk_size = min(length, get_remaining_write_space(writer));
+               if (data == NULL) {
+                       memset(writer->end, 0, chunk_size);
+               } else {
+                       memcpy(writer->end, data, chunk_size);
+                       data += chunk_size;
+               }
+
+               length -= chunk_size;
+               writer->end += chunk_size;
+
+               if (get_remaining_write_space(writer) == 0)
+                       result = uds_flush_buffered_writer(writer);
+       }
+
+       return result;
+}
+
+int uds_flush_buffered_writer(struct buffered_writer *writer)
+{
+       if (writer->error != UDS_SUCCESS)
+               return writer->error;
+
+       return flush_previous_buffer(writer);
+}
diff --git a/drivers/md/dm-vdo/indexer/io-factory.h b/drivers/md/dm-vdo/indexer/io-factory.h
new file mode 100644 (file)
index 0000000..7fb5a06
--- /dev/null
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_IO_FACTORY_H
+#define UDS_IO_FACTORY_H
+
+#include <linux/dm-bufio.h>
+
+/*
+ * The I/O factory manages all low-level I/O operations to the underlying storage device. Its main
+ * clients are the index layout and the volume. The buffered reader and buffered writer interfaces
+ * are helpers for accessing data in a contiguous range of storage blocks.
+ */
+
+struct buffered_reader;
+struct buffered_writer;
+
+struct io_factory;
+
+enum {
+       UDS_BLOCK_SIZE = 4096,
+       SECTORS_PER_BLOCK = UDS_BLOCK_SIZE >> SECTOR_SHIFT,
+};
+
+int __must_check uds_make_io_factory(struct block_device *bdev,
+                                    struct io_factory **factory_ptr);
+
+int __must_check uds_replace_storage(struct io_factory *factory,
+                                    struct block_device *bdev);
+
+void uds_put_io_factory(struct io_factory *factory);
+
+size_t __must_check uds_get_writable_size(struct io_factory *factory);
+
+int __must_check uds_make_bufio(struct io_factory *factory, off_t block_offset,
+                               size_t block_size, unsigned int reserved_buffers,
+                               struct dm_bufio_client **client_ptr);
+
+int __must_check uds_make_buffered_reader(struct io_factory *factory, off_t offset,
+                                         u64 block_count,
+                                         struct buffered_reader **reader_ptr);
+
+void uds_free_buffered_reader(struct buffered_reader *reader);
+
+int __must_check uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data,
+                                              size_t length);
+
+int __must_check uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value,
+                                         size_t length);
+
+int __must_check uds_make_buffered_writer(struct io_factory *factory, off_t offset,
+                                         u64 block_count,
+                                         struct buffered_writer **writer_ptr);
+
+void uds_free_buffered_writer(struct buffered_writer *buffer);
+
+int __must_check uds_write_to_buffered_writer(struct buffered_writer *writer,
+                                             const u8 *data, size_t length);
+
+int __must_check uds_flush_buffered_writer(struct buffered_writer *writer);
+
+#endif /* UDS_IO_FACTORY_H */
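
As a usage note (editorial, not part of the patch), a caller typically makes one factory per device and then layers a reader or writer over a range of blocks. The sketch below round-trips a buffer through a one-block region starting at block offset 0 and uses uds_verify_buffered_data() for the comparison; it assumes length is at most UDS_BLOCK_SIZE and the usual UDS_SUCCESS/error-code conventions, with error handling abbreviated.

#include "io-factory.h"

static int example_round_trip(struct block_device *bdev, const u8 *data, size_t length)
{
        struct io_factory *factory;
        struct buffered_writer *writer;
        struct buffered_reader *reader;
        int result;

        result = uds_make_io_factory(bdev, &factory);
        if (result != UDS_SUCCESS)
                return result;

        /* A one-block region starting at block offset 0. */
        result = uds_make_buffered_writer(factory, 0, 1, &writer);
        if (result != UDS_SUCCESS)
                goto out;

        result = uds_write_to_buffered_writer(writer, data, length);
        if (result == UDS_SUCCESS)
                result = uds_flush_buffered_writer(writer);
        uds_free_buffered_writer(writer);
        if (result != UDS_SUCCESS)
                goto out;

        result = uds_make_buffered_reader(factory, 0, 1, &reader);
        if (result != UDS_SUCCESS)
                goto out;

        /* Confirm the stored bytes match without copying them back out. */
        result = uds_verify_buffered_data(reader, data, length);
        uds_free_buffered_reader(reader);
out:
        uds_put_io_factory(factory);
        return result;
}
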
diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c
new file mode 100644 (file)
index 0000000..da16afa
--- /dev/null
@@ -0,0 +1,428 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "open-chapter.h"
+
+#include <linux/log2.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+
+#include "config.h"
+#include "hash-utils.h"
+
+/*
+ * Each index zone has a dedicated open chapter zone structure which gets an equal share of the
+ * open chapter space. Records are assigned to zones based on their record name. Within each zone,
+ * records are stored in an array in the order they arrive. Additionally, a reference to each
+ * record is stored in a hash table to help determine if a new record duplicates an existing one.
+ * If new metadata for an existing name arrives, the record is altered in place. The array of
+ * records is 1-based so that record number 0 can be used to indicate an unused hash slot.
+ *
+ * Deleted records are marked with a flag rather than actually removed to simplify hash table
+ * management. The array of deleted flags overlays the array of hash slots, but the flags are
+ * indexed by record number instead of by record name. The number of hash slots will always be a
+ * power of two that is greater than the number of records to be indexed, guaranteeing that hash
+ * insertion cannot fail, and that there are sufficient flags for all records.
+ *
+ * Once any open chapter zone fills its available space, the chapter is closed. The records from
+ * each zone are interleaved to attempt to preserve temporal locality and assigned to record pages.
+ * Empty or deleted records are replaced by copies of a valid record so that the record pages only
+ * contain valid records. The chapter then constructs a delta index which maps each record name to
+ * the record page on which that record can be found, which is split into index pages. These
+ * structures are then passed to the volume to be recorded on storage.
+ *
+ * When the index is saved, the open chapter records are saved in a single array, once again
+ * interleaved to attempt to preserve temporal locality. When the index is reloaded, there may be a
+ * different number of zones than previously, so the records must be parcelled out to their new
+ * zones. In addition, depending on the distribution of record names, a new zone may have more
+ * records than it has space for. In this case, the latest records for that zone will be discarded.
+ */
+
+static const u8 OPEN_CHAPTER_MAGIC[] = "ALBOC";
+static const u8 OPEN_CHAPTER_VERSION[] = "02.00";
+
+enum {
+       OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1,
+       OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1,
+       LOAD_RATIO = 2,
+};
+
+static inline size_t records_size(const struct open_chapter_zone *open_chapter)
+{
+       return sizeof(struct uds_volume_record) * (1 + open_chapter->capacity);
+}
+
+static inline size_t slots_size(size_t slot_count)
+{
+       return sizeof(struct open_chapter_zone_slot) * slot_count;
+}
+
+int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zone_count,
+                         struct open_chapter_zone **open_chapter_ptr)
+{
+       int result;
+       struct open_chapter_zone *open_chapter;
+       size_t capacity = geometry->records_per_chapter / zone_count;
+       size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO));
+
+       result = uds_allocate_extended(struct open_chapter_zone, slot_count,
+                                      struct open_chapter_zone_slot, "open chapter",
+                                      &open_chapter);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       open_chapter->slot_count = slot_count;
+       open_chapter->capacity = capacity;
+       result = uds_allocate_cache_aligned(records_size(open_chapter), "record pages",
+                                           &open_chapter->records);
+       if (result != UDS_SUCCESS) {
+               uds_free_open_chapter(open_chapter);
+               return result;
+       }
+
+       *open_chapter_ptr = open_chapter;
+       return UDS_SUCCESS;
+}
+
+void uds_reset_open_chapter(struct open_chapter_zone *open_chapter)
+{
+       open_chapter->size = 0;
+       open_chapter->deletions = 0;
+
+       memset(open_chapter->records, 0, records_size(open_chapter));
+       memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count));
+}
+
+static unsigned int probe_chapter_slots(struct open_chapter_zone *open_chapter,
+                                       const struct uds_record_name *name)
+{
+       struct uds_volume_record *record;
+       unsigned int slot_count = open_chapter->slot_count;
+       unsigned int slot = uds_name_to_hash_slot(name, slot_count);
+       unsigned int record_number;
+       unsigned int attempts = 1;
+
+       while (true) {
+               record_number = open_chapter->slots[slot].record_number;
+
+               /*
+                * If the hash slot is empty, we've reached the end of a chain without finding the
+                * record and should terminate the search.
+                */
+               if (record_number == 0)
+                       return slot;
+
+               /*
+                * If the name of the record referenced by the slot matches and has not been
+                * deleted, then we've found the requested name.
+                */
+               record = &open_chapter->records[record_number];
+               if ((memcmp(&record->name, name, UDS_RECORD_NAME_SIZE) == 0) &&
+                   !open_chapter->slots[record_number].deleted)
+                       return slot;
+
+               /*
+                * Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. This
+                * performs better than linear probing and works best for 2^N slots.
+                */
+               slot = (slot + attempts++) % slot_count;
+       }
+}
+
+void uds_search_open_chapter(struct open_chapter_zone *open_chapter,
+                            const struct uds_record_name *name,
+                            struct uds_record_data *metadata, bool *found)
+{
+       unsigned int slot;
+       unsigned int record_number;
+
+       slot = probe_chapter_slots(open_chapter, name);
+       record_number = open_chapter->slots[slot].record_number;
+       if (record_number == 0) {
+               *found = false;
+       } else {
+               *found = true;
+               *metadata = open_chapter->records[record_number].data;
+       }
+}
+
+/* Add a record to the open chapter zone and return the remaining space. */
+int uds_put_open_chapter(struct open_chapter_zone *open_chapter,
+                        const struct uds_record_name *name,
+                        const struct uds_record_data *metadata)
+{
+       unsigned int slot;
+       unsigned int record_number;
+       struct uds_volume_record *record;
+
+       if (open_chapter->size >= open_chapter->capacity)
+               return 0;
+
+       slot = probe_chapter_slots(open_chapter, name);
+       record_number = open_chapter->slots[slot].record_number;
+
+       if (record_number == 0) {
+               record_number = ++open_chapter->size;
+               open_chapter->slots[slot].record_number = record_number;
+       }
+
+       record = &open_chapter->records[record_number];
+       record->name = *name;
+       record->data = *metadata;
+
+       return open_chapter->capacity - open_chapter->size;
+}
+
+void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter,
+                                 const struct uds_record_name *name)
+{
+       unsigned int slot;
+       unsigned int record_number;
+
+       slot = probe_chapter_slots(open_chapter, name);
+       record_number = open_chapter->slots[slot].record_number;
+
+       if (record_number > 0) {
+               open_chapter->slots[record_number].deleted = true;
+               open_chapter->deletions += 1;
+       }
+}
+
+void uds_free_open_chapter(struct open_chapter_zone *open_chapter)
+{
+       if (open_chapter != NULL) {
+               uds_free(open_chapter->records);
+               uds_free(open_chapter);
+       }
+}
+
+/* Map each record name to its record page number in the delta chapter index. */
+static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones,
+                                   unsigned int zone_count,
+                                   struct open_chapter_index *index,
+                                   struct uds_volume_record *collated_records)
+{
+       int result;
+       unsigned int records_per_chapter;
+       unsigned int records_per_page;
+       unsigned int record_index;
+       unsigned int records = 0;
+       u32 page_number;
+       unsigned int z;
+       int overflow_count = 0;
+       struct uds_volume_record *fill_record = NULL;
+
+       /*
+        * The record pages should not have any empty space, so find a record with which to fill
+        * the chapter zone if it was closed early, and also to replace any deleted records. The
+        * last record in any filled zone is guaranteed to not have been deleted, so use one of
+        * those.
+        */
+       for (z = 0; z < zone_count; z++) {
+               struct open_chapter_zone *zone = chapter_zones[z];
+
+               if (zone->size == zone->capacity) {
+                       fill_record = &zone->records[zone->size];
+                       break;
+               }
+       }
+
+       records_per_chapter = index->geometry->records_per_chapter;
+       records_per_page = index->geometry->records_per_page;
+
+       for (records = 0; records < records_per_chapter; records++) {
+               struct uds_volume_record *record = &collated_records[records];
+               struct open_chapter_zone *open_chapter;
+
+               /* The record arrays in the zones are 1-based. */
+               record_index = 1 + (records / zone_count);
+               page_number = records / records_per_page;
+               open_chapter = chapter_zones[records % zone_count];
+
+               /* Use the fill record in place of an unused record. */
+               if (record_index > open_chapter->size ||
+                   open_chapter->slots[record_index].deleted) {
+                       *record = *fill_record;
+                       continue;
+               }
+
+               *record = open_chapter->records[record_index];
+               result = uds_put_open_chapter_index_record(index, &record->name,
+                                                          page_number);
+               switch (result) {
+               case UDS_SUCCESS:
+                       break;
+               case UDS_OVERFLOW:
+                       overflow_count++;
+                       break;
+               default:
+                       uds_log_error_strerror(result,
+                                              "failed to build open chapter index");
+                       return result;
+               }
+       }
+
+       if (overflow_count > 0)
+               uds_log_warning("Failed to add %d entries to chapter index",
+                               overflow_count);
+
+       return UDS_SUCCESS;
+}
+
+int uds_close_open_chapter(struct open_chapter_zone **chapter_zones,
+                          unsigned int zone_count, struct volume *volume,
+                          struct open_chapter_index *chapter_index,
+                          struct uds_volume_record *collated_records,
+                          u64 virtual_chapter_number)
+{
+       int result;
+
+       uds_empty_open_chapter_index(chapter_index, virtual_chapter_number);
+       result = fill_delta_chapter_index(chapter_zones, zone_count, chapter_index,
+                                         collated_records);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return uds_write_chapter(volume, chapter_index, collated_records);
+}
+
+int uds_save_open_chapter(struct uds_index *index, struct buffered_writer *writer)
+{
+       int result;
+       struct open_chapter_zone *open_chapter;
+       struct uds_volume_record *record;
+       u8 record_count_data[sizeof(u32)];
+       u32 record_count = 0;
+       unsigned int record_index;
+       unsigned int z;
+
+       result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC,
+                                             OPEN_CHAPTER_MAGIC_LENGTH);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION,
+                                             OPEN_CHAPTER_VERSION_LENGTH);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       for (z = 0; z < index->zone_count; z++) {
+               open_chapter = index->zones[z]->open_chapter;
+               record_count += open_chapter->size - open_chapter->deletions;
+       }
+
+       put_unaligned_le32(record_count, record_count_data);
+       result = uds_write_to_buffered_writer(writer, record_count_data,
+                                             sizeof(record_count_data));
+       if (result != UDS_SUCCESS)
+               return result;
+
+       record_index = 1;
+       while (record_count > 0) {
+               for (z = 0; z < index->zone_count; z++) {
+                       open_chapter = index->zones[z]->open_chapter;
+                       if (record_index > open_chapter->size)
+                               continue;
+
+                       if (open_chapter->slots[record_index].deleted)
+                               continue;
+
+                       record = &open_chapter->records[record_index];
+                       result = uds_write_to_buffered_writer(writer, (u8 *) record,
+                                                             sizeof(*record));
+                       if (result != UDS_SUCCESS)
+                               return result;
+
+                       record_count--;
+               }
+
+               record_index++;
+       }
+
+       return uds_flush_buffered_writer(writer);
+}
+
+u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry)
+{
+       unsigned int records_per_chapter = geometry->records_per_chapter;
+
+       return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + sizeof(u32) +
+               records_per_chapter * sizeof(struct uds_volume_record);
+}
+
+static int load_version20(struct uds_index *index, struct buffered_reader *reader)
+{
+       int result;
+       u32 record_count;
+       u8 record_count_data[sizeof(u32)];
+       struct uds_volume_record record;
+
+       /*
+        * Track which zones cannot accept any more records. If the open chapter had a different
+        * number of zones previously, some new zones may have more records than they have space
+        * for. These overflow records will be discarded.
+        */
+       bool full_flags[MAX_ZONES] = {
+               false,
+       };
+
+       result = uds_read_from_buffered_reader(reader, (u8 *) &record_count_data,
+                                              sizeof(record_count_data));
+       if (result != UDS_SUCCESS)
+               return result;
+
+       record_count = get_unaligned_le32(record_count_data);
+       while (record_count-- > 0) {
+               unsigned int zone = 0;
+
+               result = uds_read_from_buffered_reader(reader, (u8 *) &record,
+                                                      sizeof(record));
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               if (index->zone_count > 1)
+                       zone = uds_get_volume_index_zone(index->volume_index,
+                                                        &record.name);
+
+               if (!full_flags[zone]) {
+                       struct open_chapter_zone *open_chapter;
+                       unsigned int remaining;
+
+                       open_chapter = index->zones[zone]->open_chapter;
+                       remaining = uds_put_open_chapter(open_chapter, &record.name,
+                                                        &record.data);
+                       /* Do not allow any zone to fill completely. */
+                       full_flags[zone] = (remaining <= 1);
+               }
+       }
+
+       return UDS_SUCCESS;
+}
+
+int uds_load_open_chapter(struct uds_index *index, struct buffered_reader *reader)
+{
+       u8 version[OPEN_CHAPTER_VERSION_LENGTH];
+       int result;
+
+       result = uds_verify_buffered_data(reader, OPEN_CHAPTER_MAGIC,
+                                         OPEN_CHAPTER_MAGIC_LENGTH);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_read_from_buffered_reader(reader, version, sizeof(version));
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (memcmp(OPEN_CHAPTER_VERSION, version, sizeof(version)) != 0) {
+               return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                             "Invalid open chapter version: %.*s",
+                                             (int) sizeof(version), version);
+       }
+
+       return load_version20(index, reader);
+}
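
For reference, the save format produced by uds_save_open_chapter() above, and sized by uds_compute_saved_open_chapter_size(), lays out as follows (editorial summary, not part of the patch):

    offset  0: "ALBOC"        5-byte magic, no NUL terminator
    offset  5: "02.00"        5-byte version string
    offset 10: record count   u32, little-endian, total live records across all zones
    offset 14: records        record-count entries of struct uds_volume_record,
                              interleaved across zones in arrival order

The computed size assumes a full chapter (records_per_chapter records) because the live record count is not known until save time; uds_load_open_chapter() redistributes the saved records to zones by record name and discards any that overflow their new zone.
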
diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h
new file mode 100644 (file)
index 0000000..a4250bb
--- /dev/null
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_OPEN_CHAPTER_H
+#define UDS_OPEN_CHAPTER_H
+
+#include "chapter-index.h"
+#include "geometry.h"
+#include "index.h"
+#include "volume.h"
+
+/*
+ * The open chapter tracks the newest records in memory. Like the index as a whole, each open
+ * chapter is divided into a number of independent zones which are interleaved when the chapter is
+ * committed to the volume.
+ */
+
+enum {
+       OPEN_CHAPTER_RECORD_NUMBER_BITS = 23,
+};
+
+struct open_chapter_zone_slot {
+       /* If non-zero, the record number addressed by this hash slot */
+       unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS;
+       /* If true, the record at the index of this hash slot was deleted */
+       bool deleted : 1;
+} __packed;
+
+struct open_chapter_zone {
+       /* The maximum number of records that can be stored */
+       unsigned int capacity;
+       /* The number of records stored */
+       unsigned int size;
+       /* The number of deleted records */
+       unsigned int deletions;
+       /* Array of chunk records, 1-based */
+       struct uds_volume_record *records;
+       /* The number of slots in the hash table */
+       unsigned int slot_count;
+       /* The hash table slots, referencing virtual record numbers */
+       struct open_chapter_zone_slot slots[];
+};
+
+int __must_check uds_make_open_chapter(const struct index_geometry *geometry,
+                                      unsigned int zone_count,
+                                      struct open_chapter_zone **open_chapter_ptr);
+
+void uds_reset_open_chapter(struct open_chapter_zone *open_chapter);
+
+void uds_search_open_chapter(struct open_chapter_zone *open_chapter,
+                            const struct uds_record_name *name,
+                            struct uds_record_data *metadata, bool *found);
+
+int __must_check uds_put_open_chapter(struct open_chapter_zone *open_chapter,
+                                     const struct uds_record_name *name,
+                                     const struct uds_record_data *metadata);
+
+void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter,
+                                 const struct uds_record_name *name);
+
+void uds_free_open_chapter(struct open_chapter_zone *open_chapter);
+
+int __must_check uds_close_open_chapter(struct open_chapter_zone **chapter_zones,
+                                       unsigned int zone_count, struct volume *volume,
+                                       struct open_chapter_index *chapter_index,
+                                       struct uds_volume_record *collated_records,
+                                       u64 virtual_chapter_number);
+
+int __must_check uds_save_open_chapter(struct uds_index *index,
+                                      struct buffered_writer *writer);
+
+int __must_check uds_load_open_chapter(struct uds_index *index,
+                                      struct buffered_reader *reader);
+
+u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry);
+
+#endif /* UDS_OPEN_CHAPTER_H */
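
As a quick orientation to the API above (editorial, not part of the patch): each zone is created from the geometry and a zone count, records are added and looked up by name, and a zero return from uds_put_open_chapter() means the zone is full. A minimal single-zone sketch, with error handling trimmed:

#include "open-chapter.h"

static int example_open_chapter(const struct index_geometry *geometry,
                                const struct uds_record_name *name,
                                const struct uds_record_data *data)
{
        struct open_chapter_zone *zone;
        struct uds_record_data found_data;
        bool found;
        int remaining;
        int result;

        result = uds_make_open_chapter(geometry, 1, &zone);
        if (result != UDS_SUCCESS)
                return result;

        /* Add one record; the return value is the space left in this zone. */
        remaining = uds_put_open_chapter(zone, name, data);

        /* Look the record up by name, then mark it deleted. */
        uds_search_open_chapter(zone, name, &found_data, &found);
        uds_remove_from_open_chapter(zone, name);

        if (remaining == 0) {
                /* A full zone would normally trigger uds_close_open_chapter(); just reset here. */
                uds_reset_open_chapter(zone);
        }

        uds_free_open_chapter(zone);
        return UDS_SUCCESS;
}
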
diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c
new file mode 100644 (file)
index 0000000..1f17c70
--- /dev/null
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "radix-sort.h"
+
+#include <linux/limits.h>
+#include <linux/types.h>
+
+#include "memory-alloc.h"
+#include "string-utils.h"
+
+/*
+ * This implementation allocates one large object to do the sorting, which can be reused as many
+ * times as desired. The amount of memory required is logarithmically proportional to the number of
+ * keys to be sorted.
+ */
+
+enum {
+       /* Piles smaller than this are handled with a simple insertion sort. */
+       INSERTION_SORT_THRESHOLD = 12,
+};
+
+/* Sort keys are pointers to immutable fixed-length arrays of bytes. */
+typedef const u8 *sort_key_t;
+
+/*
+ * The keys are separated into piles based on the byte in each key at the current offset, so the
+ * number of keys with each byte must be counted.
+ */
+struct histogram {
+       /* The number of non-empty bins */
+       u16 used;
+       /* The index (key byte) of the first non-empty bin */
+       u16 first;
+       /* The index (key byte) of the last non-empty bin */
+       u16 last;
+       /* The number of occurrences of each specific byte */
+       u32 size[256];
+};
+
+/*
+ * Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound
+ * on the stack space needed.
+ */
+struct task {
+       /* Pointer to the first key to sort. */
+       sort_key_t *first_key;
+       /* Pointer to the last key to sort. */
+       sort_key_t *last_key;
+       /* The offset into the key at which to continue sorting. */
+       u16 offset;
+       /* The number of bytes remaining in the sort keys. */
+       u16 length;
+};
+
+struct radix_sorter {
+       unsigned int count;
+       struct histogram bins;
+       sort_key_t *pile[256];
+       struct task *end_of_stack;
+       struct task insertion_list[256];
+       struct task stack[];
+};
+
+/* Compare a segment of two fixed-length keys starting at an offset. */
+static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length)
+{
+       return memcmp(&key1[offset], &key2[offset], length);
+}
+
+/* Insert the next unsorted key into an array of sorted keys. */
+static inline void insert_key(const struct task task, sort_key_t *next)
+{
+       /* Pull the unsorted key out, freeing up the array slot. */
+       sort_key_t unsorted = *next;
+
+       /* Compare the key to the preceding sorted entries, shifting down ones that are larger. */
+       while ((--next >= task.first_key) &&
+              (compare(unsorted, next[0], task.offset, task.length) < 0))
+               next[1] = next[0];
+
+       /* Insert the key into the last slot that was cleared, sorting it. */
+       next[1] = unsorted;
+}
+
+/*
+ * Sort a range of key segments using an insertion sort. This simple sort is faster than the
+ * 256-way radix sort when the number of keys to sort is small.
+ */
+static inline void insertion_sort(const struct task task)
+{
+       sort_key_t *next;
+
+       for (next = task.first_key + 1; next <= task.last_key; next++)
+               insert_key(task, next);
+}
+
+/* Push a sorting task onto a task stack. */
+static inline void push_task(struct task **stack_pointer, sort_key_t *first_key,
+                            u32 count, u16 offset, u16 length)
+{
+       struct task *task = (*stack_pointer)++;
+
+       task->first_key = first_key;
+       task->last_key = &first_key[count - 1];
+       task->offset = offset;
+       task->length = length;
+}
+
+static inline void swap_keys(sort_key_t *a, sort_key_t *b)
+{
+       sort_key_t c = *a;
+       *a = *b;
+       *b = c;
+}
+
+/*
+ * Count the number of times each byte value appears in the arrays of keys to sort at the current
+ * offset, keeping track of the number of non-empty bins, and the index of the first and last
+ * non-empty bin.
+ */
+static inline void measure_bins(const struct task task, struct histogram *bins)
+{
+       sort_key_t *key_ptr;
+
+       /*
+        * Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears
+        * it all out as it goes. Even though this structure is re-used, we don't need to pay to
+        * zero it before starting a new tally.
+        */
+       bins->first = U8_MAX;
+       bins->last = 0;
+
+       for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) {
+               /* Increment the count for the byte in the key at the current offset. */
+               u8 bin = (*key_ptr)[task.offset];
+               u32 size = ++bins->size[bin];
+
+               /* Track non-empty bins. */
+               if (size == 1) {
+                       bins->used += 1;
+                       if (bin < bins->first)
+                               bins->first = bin;
+
+                       if (bin > bins->last)
+                               bins->last = bin;
+               }
+       }
+}
+
+/*
+ * Convert the bin sizes to pointers to where each pile goes.
+ *
+ *   pile[0] = first_key + bin->size[0],
+ *   pile[1] = pile[0]  + bin->size[1], etc.
+ *
+ * After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the
+ * next radix position. A new task is put on the stack for each pile containing lots of keys, or a
+ * new task is put on the list for each pile containing few keys.
+ *
+ * @stack: pointer to the top of the stack
+ * @end_of_stack: the end of the stack
+ * @list: pointer to the head of the list
+ * @pile: array for pointers to the end of each pile
+ * @bins: the histogram of the sizes of each pile
+ * @first_key: the first key of the stack
+ * @offset: the next radix position to sort by
+ * @length: the number of bytes remaining in the sort keys
+ *
+ * Return: UDS_SUCCESS or an error code
+ */
+static inline int push_bins(struct task **stack, struct task *end_of_stack,
+                           struct task **list, sort_key_t *pile[],
+                           struct histogram *bins, sort_key_t *first_key,
+                           u16 offset, u16 length)
+{
+       sort_key_t *pile_start = first_key;
+       int bin;
+
+       for (bin = bins->first; ; bin++) {
+               u32 size = bins->size[bin];
+
+               /* Skip empty piles. */
+               if (size == 0)
+                       continue;
+
+               /* There's no need to sort empty keys. */
+               if (length > 0) {
+                       if (size > INSERTION_SORT_THRESHOLD) {
+                               if (*stack >= end_of_stack)
+                                       return UDS_BAD_STATE;
+
+                               push_task(stack, pile_start, size, offset, length);
+                       } else if (size > 1) {
+                               push_task(list, pile_start, size, offset, length);
+                       }
+               }
+
+               pile_start += size;
+               pile[bin] = pile_start;
+               if (--bins->used == 0)
+                       break;
+       }
+
+       return UDS_SUCCESS;
+}
+
+int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter)
+{
+       int result;
+       unsigned int stack_size = count / INSERTION_SORT_THRESHOLD;
+       struct radix_sorter *radix_sorter;
+
+       result = uds_allocate_extended(struct radix_sorter, stack_size, struct task,
+                                      __func__, &radix_sorter);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       radix_sorter->count = count;
+       radix_sorter->end_of_stack = radix_sorter->stack + stack_size;
+       *sorter = radix_sorter;
+       return UDS_SUCCESS;
+}
+
+void uds_free_radix_sorter(struct radix_sorter *sorter)
+{
+       uds_free(sorter);
+}
+
+/*
+ * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort implementation
+ * is unstable, so the relative ordering of equal keys is not preserved.
+ */
+int uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
+                  unsigned int count, unsigned short length)
+{
+       struct task start;
+       struct histogram *bins = &sorter->bins;
+       sort_key_t **pile = sorter->pile;
+       struct task *task_stack = sorter->stack;
+
+       /* All zero-length keys are identical and therefore already sorted. */
+       if ((count == 0) || (length == 0))
+               return UDS_SUCCESS;
+
+       /* The initial task is to sort the entire length of all the keys. */
+       start = (struct task) {
+               .first_key = keys,
+               .last_key = &keys[count - 1],
+               .offset = 0,
+               .length = length,
+       };
+
+       if (count <= INSERTION_SORT_THRESHOLD) {
+               insertion_sort(start);
+               return UDS_SUCCESS;
+       }
+
+       if (count > sorter->count)
+               return UDS_INVALID_ARGUMENT;
+
+       /*
+        * Repeatedly consume a sorting task from the stack and process it, pushing new sub-tasks
+        * onto the stack for each radix-sorted pile. When all tasks and sub-tasks have been
+        * processed, the stack will be empty and all the keys in the starting task will be fully
+        * sorted.
+        */
+       for (*task_stack = start; task_stack >= sorter->stack; task_stack--) {
+               const struct task task = *task_stack;
+               struct task *insertion_task_list;
+               int result;
+               sort_key_t *fence;
+               sort_key_t *end;
+
+               measure_bins(task, bins);
+
+               /*
+                * Now that we know how large each bin is, generate pointers for each of the piles
+                * and push a new task to sort each pile by the next radix byte.
+                */
+               insertion_task_list = sorter->insertion_list;
+               result = push_bins(&task_stack, sorter->end_of_stack,
+                                  &insertion_task_list, pile, bins, task.first_key,
+                                  task.offset + 1, task.length - 1);
+               if (result != UDS_SUCCESS) {
+                       memset(bins, 0, sizeof(*bins));
+                       return result;
+               }
+
+               /* Now bins->used is zero again. */
+
+               /*
+                * Don't bother processing the last pile: when piles 0..N-1 are all in place, then
+                * pile N must also be in place.
+                */
+               end = task.last_key - bins->size[bins->last];
+               bins->size[bins->last] = 0;
+
+               for (fence = task.first_key; fence <= end; ) {
+                       u8 bin;
+                       sort_key_t key = *fence;
+
+                       /*
+                        * The radix byte of the key tells us which pile it belongs in. Swap it for
+                        * an unprocessed item just below that pile, and repeat.
+                        */
+                       while (--pile[bin = key[task.offset]] > fence)
+                               swap_keys(pile[bin], &key);
+
+                       /*
+                        * The pile reached the fence. Put the key at the bottom of that pile,
+                        * completing it, and advance the fence to the next pile.
+                        */
+                       *fence = key;
+                       fence += bins->size[bin];
+                       bins->size[bin] = 0;
+               }
+
+               /* Now bins->size[] is all zero again. */
+
+               /*
+                * When the number of keys in a task gets small enough, it is faster to use an
+                * insertion sort than to keep subdividing into tiny piles.
+                */
+               while (--insertion_task_list >= sorter->insertion_list)
+                       insertion_sort(*insertion_task_list);
+       }
+
+       return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/radix-sort.h b/drivers/md/dm-vdo/indexer/radix-sort.h
new file mode 100644 (file)
index 0000000..812949b
--- /dev/null
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_RADIX_SORT_H
+#define UDS_RADIX_SORT_H
+
+/*
+ * Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix
+ * exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith
+ * Bostic, and M. Douglas McIlroy, "Engineering Radix Sort".
+ *
+ * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf
+ */
+
+struct radix_sorter;
+
+int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter);
+
+void uds_free_radix_sorter(struct radix_sorter *sorter);
+
+int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
+                               unsigned int count, unsigned short length);
+
+#endif /* UDS_RADIX_SORT_H */
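
A usage sketch (editorial, not part of the patch): the sorter is allocated once for the largest batch it will see and then reused; uds_radix_sort() reorders the array of key pointers in place and, being unstable, does not preserve the relative order of equal keys. The 16-byte key length here is only an example value.

#include "radix-sort.h"

static int example_sort_keys(const unsigned char *keys[], unsigned int count)
{
        struct radix_sorter *sorter;
        int result;

        result = uds_make_radix_sorter(count, &sorter);
        if (result != UDS_SUCCESS)
                return result;

        /* Sort the pointers by comparing 16 bytes of each key. */
        result = uds_radix_sort(sorter, keys, count, 16);
        uds_free_radix_sorter(sorter);
        return result;
}
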
diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c
new file mode 100644 (file)
index 0000000..f2141de
--- /dev/null
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "sparse-cache.h"
+
+#include <linux/cache.h>
+#include <linux/delay.h>
+#include <linux/dm-bufio.h>
+
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+
+#include "chapter-index.h"
+#include "config.h"
+#include "index.h"
+
+/*
+ * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
+ * specific virtual chapter is implemented as a linear search. The cache replacement policy is
+ * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
+ * maintained by shifting entries in an array list.
+ *
+ * Changing the contents of the cache requires the coordinated participation of all zone threads
+ * via the careful use of barrier messages sent to all the index zones by the triage queue worker
+ * thread. The critical invariant for coordination is that the cache membership must not change
+ * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must
+ * receive the same results for every virtual chapter number. To ensure that critical invariant,
+ * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
+ * chapter because it has had too many cache misses" are represented separately from the cache
+ * membership information (the virtual chapter number).
+ *
+ * As a result of this invariant, we have the guarantee that every zone thread will call
+ * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache,
+ * and the serialization of the barrier requests from the triage queue ensures they will all
+ * request the same chapter number. This means the only synchronization we need can be provided by
+ * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
+ * section where a single zone thread can drive the cache update while all the other zone threads
+ * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
+ * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
+ * exclusive lock. No other threads may access or modify the cache entries.
+ *
+ * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
+ * All fields that might be frequently updated by that thread are kept in separate cache-aligned
+ * structures so they will not cause cache contention via "false sharing" with the fields that are
+ * frequently accessed by all of the zone threads.
+ *
+ * The LRU order is managed independently by each zone thread, and each zone uses its own list for
+ * searching and cache membership queries. The zone zero list is used to decide which chapter to
+ * evict when the cache is updated, and its search list is copied to the other threads at that
+ * time.
+ *
+ * The virtual chapter number field of the cache entry is the single field indicating whether a
+ * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
+ * undefined chapter number. When present in the virtual chapter number field of a
+ * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
+ * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
+ * cache entry that is not marked as dead is fully defined and a member of the cache, and
+ * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
+ * in any of the cache entries.
+ *
+ * A chapter index that is a member of the cache may be excluded from searches between calls to
+ * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
+ * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
+ * that chapter is no longer part of the volume, there's no point in continuing to search that
+ * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
+ * cache, but it will no longer be searched for matching names.
+ *
+ * The second mechanism is a heuristic based on keeping track of the number of consecutive search
+ * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
+ * be set to true, causing the chapter to be skipped when searching the entire cache, but still
+ * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
+ * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
+ * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
+ * considered a member of the cache for uds_sparse_cache_contains().
+ */
+
+enum {
+       SKIP_SEARCH_THRESHOLD = 20000,
+       ZONE_ZERO = 0,
+};
+
+/*
+ * These counters are essentially fields of the struct cached_chapter_index, but are segregated
+ * into this structure because they are frequently modified. They are grouped and aligned to keep
+ * them on different cache lines from the chapter fields that are accessed far more often than they
+ * are updated.
+ */
+struct __aligned(L1_CACHE_BYTES) cached_index_counters {
+       u64 consecutive_misses;
+};
+
+struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
+       /*
+        * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
+        * entry is unused. This field must only be modified in the critical section in
+        * uds_update_sparse_cache().
+        */
+       u64 virtual_chapter;
+
+       u32 index_pages_count;
+
+       /*
+        * These pointers are immutable during the life of the cache. The contents of the arrays
+        * change when the cache entry is replaced.
+        */
+       struct delta_index_page *index_pages;
+       struct dm_buffer **page_buffers;
+
+       /*
+        * If set, skip the chapter when searching the entire cache. This flag is just a
+        * performance optimization. This flag is mutable between cache updates, but it rarely
+        * changes and is frequently accessed, so it groups with the immutable fields.
+        */
+       bool skip_search;
+
+       /*
+        * The cache-aligned counters change often and are placed at the end of the structure to
+        * prevent false sharing with the more stable fields above.
+        */
+       struct cached_index_counters counters;
+};
+
+/*
+ * A search_list represents an ordering of the sparse chapter index cache entry array, from most
+ * recently accessed to least recently accessed, which is the order in which the indexes should be
+ * searched and the reverse order in which they should be evicted from the cache.
+ *
+ * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
+ * iterate over them to search, and ensuring that dead entries are replaced before any live entries
+ * are evicted.
+ *
+ * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
+ * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
+ * zone threads.
+ */
+struct search_list {
+       u8 capacity;
+       u8 first_dead_entry;
+       struct cached_chapter_index *entries[];
+};
+
+struct threads_barrier {
+       /* Lock for this barrier object */
+       struct semaphore lock;
+       /* Semaphore for threads waiting at this barrier */
+       struct semaphore wait;
+       /* Number of threads which have arrived */
+       int arrived;
+       /* Total number of threads using this barrier */
+       int thread_count;
+};
+
+struct sparse_cache {
+       const struct index_geometry *geometry;
+       unsigned int capacity;
+       unsigned int zone_count;
+
+       unsigned int skip_threshold;
+       struct search_list *search_lists[MAX_ZONES];
+       struct cached_chapter_index **scratch_entries;
+
+       struct threads_barrier begin_update_barrier;
+       struct threads_barrier end_update_barrier;
+
+       struct cached_chapter_index chapters[];
+};
+
+static void initialize_threads_barrier(struct threads_barrier *barrier,
+                                      unsigned int thread_count)
+{
+       sema_init(&barrier->lock, 1);
+       barrier->arrived = 0;
+       barrier->thread_count = thread_count;
+       sema_init(&barrier->wait, 0);
+}
+
+static inline void __down(struct semaphore *semaphore)
+{
+       /*
+        * Do not use down(semaphore). Instead use down_interruptible so that
+        * we do not get 120 second stall messages in kern.log.
+        */
+       while (down_interruptible(semaphore) != 0) {
+               /*
+                * If we're called from a user-mode process (e.g., "dmsetup
+                * remove") while waiting for an operation that may take a
+                * while (e.g., UDS index save), and a signal is sent (SIGINT,
+                * SIGUSR2), then down_interruptible will not block. If that
+                * happens, sleep briefly to avoid keeping the CPU locked up in
+                * this loop. We could just call cond_resched, but then we'd
+                * still keep consuming CPU time slices and swamp other threads
+                * trying to do computational work.
+                */
+               fsleep(1000);
+       }
+}
+
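+/*
+ * Block until every zone thread has entered the barrier. As a sketch of the intended usage
+ * (see uds_update_sparse_cache() below): all zone threads enter begin_update_barrier, zone
+ * zero alone modifies the cache while the others wait, and then every thread enters
+ * end_update_barrier before any of them resumes searching.
+ */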
+static void enter_threads_barrier(struct threads_barrier *barrier)
+{
+       __down(&barrier->lock);
+       if (++barrier->arrived == barrier->thread_count) {
+               /* last thread */
+               int i;
+
+               for (i = 1; i < barrier->thread_count; i++)
+                       up(&barrier->wait);
+
+               barrier->arrived = 0;
+               up(&barrier->lock);
+       } else {
+               up(&barrier->lock);
+               __down(&barrier->wait);
+       }
+}
+
+static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
+                                                       const struct index_geometry *geometry)
+{
+       int result;
+
+       chapter->virtual_chapter = NO_CHAPTER;
+       chapter->index_pages_count = geometry->index_pages_per_chapter;
+
+       result = uds_allocate(chapter->index_pages_count, struct delta_index_page,
+                             __func__, &chapter->index_pages);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return uds_allocate(chapter->index_pages_count, struct dm_buffer *,
+                           "sparse index volume pages", &chapter->page_buffers);
+}
+
+static int __must_check make_search_list(struct sparse_cache *cache,
+                                        struct search_list **list_ptr)
+{
+       struct search_list *list;
+       unsigned int bytes;
+       u8 i;
+       int result;
+
+       bytes = (sizeof(struct search_list) +
+                (cache->capacity * sizeof(struct cached_chapter_index *)));
+       result = uds_allocate_cache_aligned(bytes, "search list", &list);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       list->capacity = cache->capacity;
+       list->first_dead_entry = 0;
+
+       for (i = 0; i < list->capacity; i++)
+               list->entries[i] = &cache->chapters[i];
+
+       *list_ptr = list;
+       return UDS_SUCCESS;
+}
+
+int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
+                         unsigned int zone_count, struct sparse_cache **cache_ptr)
+{
+       int result;
+       unsigned int i;
+       struct sparse_cache *cache;
+       unsigned int bytes;
+
+       bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index)));
+       result = uds_allocate_cache_aligned(bytes, "sparse cache", &cache);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       cache->geometry = geometry;
+       cache->capacity = capacity;
+       cache->zone_count = zone_count;
+
+       /*
+        * Scale down the skip threshold since the cache only counts cache misses in zone zero, but
+        * requests are being handled in all zones.
+        */
+       cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);
+
+       initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
+       initialize_threads_barrier(&cache->end_update_barrier, zone_count);
+
+       for (i = 0; i < capacity; i++) {
+               result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
+               if (result != UDS_SUCCESS)
+                       goto out;
+       }
+
+       for (i = 0; i < zone_count; i++) {
+               result = make_search_list(cache, &cache->search_lists[i]);
+               if (result != UDS_SUCCESS)
+                       goto out;
+       }
+
+       /* purge_search_list() needs some temporary lists for sorting. */
+       result = uds_allocate(capacity * 2, struct cached_chapter_index *,
+                             "scratch entries", &cache->scratch_entries);
+       if (result != UDS_SUCCESS)
+               goto out;
+
+       *cache_ptr = cache;
+       return UDS_SUCCESS;
+out:
+       uds_free_sparse_cache(cache);
+       return result;
+}
+
+static inline void set_skip_search(struct cached_chapter_index *chapter,
+                                  bool skip_search)
+{
+       /* Check before setting to reduce cache line contention. */
+       if (READ_ONCE(chapter->skip_search) != skip_search)
+               WRITE_ONCE(chapter->skip_search, skip_search);
+}
+
+static void score_search_hit(struct cached_chapter_index *chapter)
+{
+       chapter->counters.consecutive_misses = 0;
+       set_skip_search(chapter, false);
+}
+
+static void score_search_miss(struct sparse_cache *cache,
+                             struct cached_chapter_index *chapter)
+{
+       chapter->counters.consecutive_misses++;
+       if (chapter->counters.consecutive_misses > cache->skip_threshold)
+               set_skip_search(chapter, true);
+}
+
+static void release_cached_chapter_index(struct cached_chapter_index *chapter)
+{
+       unsigned int i;
+
+       chapter->virtual_chapter = NO_CHAPTER;
+       if (chapter->page_buffers == NULL)
+               return;
+
+       for (i = 0; i < chapter->index_pages_count; i++) {
+               if (chapter->page_buffers[i] != NULL)
+                       dm_bufio_release(uds_forget(chapter->page_buffers[i]));
+       }
+}
+
+void uds_free_sparse_cache(struct sparse_cache *cache)
+{
+       unsigned int i;
+
+       if (cache == NULL)
+               return;
+
+       uds_free(cache->scratch_entries);
+
+       for (i = 0; i < cache->zone_count; i++)
+               uds_free(cache->search_lists[i]);
+
+       for (i = 0; i < cache->capacity; i++) {
+               release_cached_chapter_index(&cache->chapters[i]);
+               uds_free(cache->chapters[i].index_pages);
+               uds_free(cache->chapters[i].page_buffers);
+       }
+
+       uds_free(cache);
+}
+
+/*
+ * Take the indicated element of the search list and move it to the start, pushing the pointers
+ * previously before it back down the list.
+ */
+static inline void set_newest_entry(struct search_list *search_list, u8 index)
+{
+       struct cached_chapter_index *newest;
+
+       if (index > 0) {
+               newest = search_list->entries[index];
+               memmove(&search_list->entries[1], &search_list->entries[0],
+                       index * sizeof(struct cached_chapter_index *));
+               search_list->entries[0] = newest;
+       }
+
+       /*
+        * This function may have moved a dead chapter to the front of the list for reuse, in which
+        * case the set of dead chapters becomes smaller.
+        */
+       if (search_list->first_dead_entry <= index)
+               search_list->first_dead_entry++;
+}
+
+bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
+                              unsigned int zone_number)
+{
+       struct search_list *search_list;
+       struct cached_chapter_index *chapter;
+       u8 i;
+
+       /*
+        * The correctness of the barriers depends on the invariant that between calls to
+        * uds_update_sparse_cache(), the answers this function returns must never vary: the result
+        * for a given chapter must be identical across zones. That invariant must be maintained
+        * even if the chapter falls off the end of the volume, or if searching it is disabled
+        * because of too many search misses.
+        */
+       search_list = cache->search_lists[zone_number];
+       for (i = 0; i < search_list->first_dead_entry; i++) {
+               chapter = search_list->entries[i];
+
+               if (virtual_chapter == chapter->virtual_chapter) {
+                       if (zone_number == ZONE_ZERO)
+                               score_search_hit(chapter);
+
+                       set_newest_entry(search_list, i);
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU
+ * ordering that already existed. This operation must only be called during the critical section in
+ * uds_update_sparse_cache().
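+ *
+ * For illustration: if the live portion of a search list is [A, B, C] where B has fallen off
+ * the volume and C has skip_search set, the purge leaves the list ordered [A, C, B] with
+ * first_dead_entry set to 2, so subsequent searches consider only A and C.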
+ */
+static void purge_search_list(struct search_list *search_list,
+                             struct sparse_cache *cache, u64 oldest_virtual_chapter)
+{
+       struct cached_chapter_index **entries;
+       struct cached_chapter_index **skipped;
+       struct cached_chapter_index **dead;
+       struct cached_chapter_index *chapter;
+       unsigned int next_alive = 0;
+       unsigned int next_skipped = 0;
+       unsigned int next_dead = 0;
+       unsigned int i;
+
+       entries = &search_list->entries[0];
+       skipped = &cache->scratch_entries[0];
+       dead = &cache->scratch_entries[search_list->capacity];
+
+       for (i = 0; i < search_list->first_dead_entry; i++) {
+               chapter = search_list->entries[i];
+               if ((chapter->virtual_chapter < oldest_virtual_chapter) ||
+                   (chapter->virtual_chapter == NO_CHAPTER))
+                       dead[next_dead++] = chapter;
+               else if (chapter->skip_search)
+                       skipped[next_skipped++] = chapter;
+               else
+                       entries[next_alive++] = chapter;
+       }
+
+       memcpy(&entries[next_alive], skipped,
+              next_skipped * sizeof(struct cached_chapter_index *));
+       memcpy(&entries[next_alive + next_skipped], dead,
+              next_dead * sizeof(struct cached_chapter_index *));
+       search_list->first_dead_entry = next_alive + next_skipped;
+}
+
+static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
+                                           u64 virtual_chapter,
+                                           const struct volume *volume)
+{
+       int result;
+
+       release_cached_chapter_index(chapter);
+
+       result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
+                                                   chapter->page_buffers,
+                                                   chapter->index_pages);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       chapter->counters.consecutive_misses = 0;
+       chapter->virtual_chapter = virtual_chapter;
+       chapter->skip_search = false;
+
+       return UDS_SUCCESS;
+}
+
+static inline void copy_search_list(const struct search_list *source,
+                                   struct search_list *target)
+{
+       *target = *source;
+       memcpy(target->entries, source->entries,
+              source->capacity * sizeof(struct cached_chapter_index *));
+}
+
+/*
+ * Update the sparse cache to contain a chapter index. This function must be called by all the zone
+ * threads with the same chapter number to correctly enter the thread barriers used to synchronize
+ * the cache updates.
+ */
+int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
+{
+       int result = UDS_SUCCESS;
+       const struct uds_index *index = zone->index;
+       struct sparse_cache *cache = index->volume->sparse_cache;
+
+       if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
+               return UDS_SUCCESS;
+
+       /*
+        * Wait for every zone thread to reach its corresponding barrier request and invoke this
+        * function before starting to modify the cache.
+        */
+       enter_threads_barrier(&cache->begin_update_barrier);
+
+       /*
+        * This is the start of the critical section: the zone zero thread is captain, effectively
+        * holding an exclusive lock on the sparse cache. All the other zone threads must do
+        * nothing between the two barriers. They will wait at the end_update_barrier again for the
+        * captain to finish the update.
+        */
+
+       if (zone->id == ZONE_ZERO) {
+               unsigned int z;
+               struct search_list *list = cache->search_lists[ZONE_ZERO];
+
+               purge_search_list(list, cache, zone->oldest_virtual_chapter);
+
+               if (virtual_chapter >= index->oldest_virtual_chapter) {
+                       set_newest_entry(list, list->capacity - 1);
+                       result = cache_chapter_index(list->entries[0], virtual_chapter,
+                                                    index->volume);
+               }
+
+               for (z = 1; z < cache->zone_count; z++)
+                       copy_search_list(list, cache->search_lists[z]);
+       }
+
+       /*
+        * This is the end of the critical section. All cache invariants must have been restored.
+        */
+       enter_threads_barrier(&cache->end_update_barrier);
+       return result;
+}
+
+void uds_invalidate_sparse_cache(struct sparse_cache *cache)
+{
+       unsigned int i;
+
+       for (i = 0; i < cache->capacity; i++)
+               release_cached_chapter_index(&cache->chapters[i]);
+}
+
+static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
+                                      u64 oldest_chapter, u64 requested_chapter)
+{
+       if ((chapter->virtual_chapter == NO_CHAPTER) ||
+           (chapter->virtual_chapter < oldest_chapter))
+               return true;
+
+       if (requested_chapter != NO_CHAPTER)
+               return requested_chapter != chapter->virtual_chapter;
+       else
+               return READ_ONCE(chapter->skip_search);
+}
+
+static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
+                                                   const struct index_geometry *geometry,
+                                                   const struct index_page_map *index_page_map,
+                                                   const struct uds_record_name *name,
+                                                   u16 *record_page_ptr)
+{
+       u32 physical_chapter =
+               uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
+       u32 index_page_number =
+               uds_find_index_page_number(index_page_map, name, physical_chapter);
+       struct delta_index_page *index_page =
+               &chapter->index_pages[index_page_number];
+
+       return uds_search_chapter_index_page(index_page, geometry, name,
+                                            record_page_ptr);
+}
+
+int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
+                           u64 *virtual_chapter_ptr, u16 *record_page_ptr)
+{
+       int result;
+       struct volume *volume = zone->index->volume;
+       struct sparse_cache *cache = volume->sparse_cache;
+       struct cached_chapter_index *chapter;
+       struct search_list *search_list;
+       u8 i;
+       /* Search the entire cache unless a specific chapter was requested. */
+       bool search_one = (*virtual_chapter_ptr != NO_CHAPTER);
+
+       *record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
+       search_list = cache->search_lists[zone->id];
+       for (i = 0; i < search_list->first_dead_entry; i++) {
+               chapter = search_list->entries[i];
+
+               if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
+                                       *virtual_chapter_ptr))
+                       continue;
+
+               result = search_cached_chapter_index(chapter, cache->geometry,
+                                                    volume->index_page_map, name,
+                                                    record_page_ptr);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) {
+                       /*
+                        * In theory, this might be a false match while a true match exists in
+                        * another chapter, but that's a very rare case and not worth the extra
+                        * search complexity.
+                        */
+                       set_newest_entry(search_list, i);
+                       if (zone->id == ZONE_ZERO)
+                               score_search_hit(chapter);
+
+                       *virtual_chapter_ptr = chapter->virtual_chapter;
+                       return UDS_SUCCESS;
+               }
+
+               if (zone->id == ZONE_ZERO)
+                       score_search_miss(cache, chapter);
+
+               if (search_one)
+                       break;
+       }
+
+       return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.h b/drivers/md/dm-vdo/indexer/sparse-cache.h
new file mode 100644 (file)
index 0000000..45e2dcf
--- /dev/null
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_SPARSE_CACHE_H
+#define UDS_SPARSE_CACHE_H
+
+#include "geometry.h"
+#include "indexer.h"
+
+/*
+ * The sparse cache is a cache of entire chapter indexes from sparse chapters used for searching
+ * for names after all other search paths have failed. It contains only complete chapter indexes;
+ * record pages from sparse chapters and single index pages used for resolving hooks are kept in
+ * the regular page cache in the volume.
+ *
+ * The most important property of this cache is the absence of synchronization for read operations.
+ * Safe concurrent access to the cache by the zone threads is controlled by the triage queue and
+ * the barrier requests it issues to the zone queues. The set of cached chapters does not and must
+ * not change between the carefully coordinated calls to uds_update_sparse_cache() from the zone
+ * threads. Outside of updates, every zone will get the same result when calling
+ * uds_sparse_cache_contains() as every other zone.
+ */
+
+struct index_zone;
+struct sparse_cache;
+
+int __must_check uds_make_sparse_cache(const struct index_geometry *geometry,
+                                      unsigned int capacity, unsigned int zone_count,
+                                      struct sparse_cache **cache_ptr);
+
+void uds_free_sparse_cache(struct sparse_cache *cache);
+
+bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
+                              unsigned int zone_number);
+
+int __must_check uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter);
+
+void uds_invalidate_sparse_cache(struct sparse_cache *cache);
+
+int __must_check uds_search_sparse_cache(struct index_zone *zone,
+                                        const struct uds_record_name *name,
+                                        u64 *virtual_chapter_ptr, u16 *record_page_ptr);
+
+#endif /* UDS_SPARSE_CACHE_H */
diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c
new file mode 100644 (file)
index 0000000..8cbd928
--- /dev/null
@@ -0,0 +1,1281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+#include "volume-index.h"
+
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/cache.h>
+#include <linux/compiler.h>
+#include <linux/log2.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "numeric.h"
+#include "permassert.h"
+#include "thread-utils.h"
+
+#include "config.h"
+#include "geometry.h"
+#include "hash-utils.h"
+#include "indexer.h"
+
+/*
+ * The volume index is a combination of two separate subindexes, one containing sparse hook entries
+ * (retained for all chapters), and one containing the remaining entries (retained only for the
+ * dense chapters). If there are no sparse chapters, only the non-hook subindex is used, and it
+ * will contain all records for all chapters.
+ *
+ * The volume index is also divided into zones, with one thread operating on each zone. Each
+ * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex.
+ * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to
+ * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta
+ * lists must be the square of the maximum zone count for both subindexes.
+ *
+ * Each subindex zone is a delta index where the payload is a chapter number. The volume index can
+ * compute the delta list number, address, and zone number from the record name in order to
+ * dispatch record handling to the correct structures.
+ *
+ * Most operations that use all the zones take place either before request processing is allowed,
+ * or after all requests have been flushed in order to shut down. The only multi-threaded operation
+ * supported during normal operation is the uds_lookup_volume_index_name() method, used to determine
+ * whether a new chapter should be loaded into the sparse index cache. This operation only uses the
+ * sparse hook subindex, and the zone mutexes are used to make this operation safe.
+ *
+ * There are three ways of expressing chapter numbers in the volume index: virtual, index, and
+ * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long.
+ * Internally the subindex stores only the minimal number of bits necessary by masking away the
+ * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when
+ * flushing entries from older chapters, it rolls the index chapter number around so that the
+ * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries()
+ * for an example of this technique.
+ *
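+ * For illustration, assume chapter_bits is 10, making the chapter mask 0x3ff: virtual chapter
+ * 5000 is stored as index chapter 5000 & 0x3ff = 904. If a zone's virtual_chapter_low is 4200,
+ * the rolling chapter is (904 - 4200) & 0x3ff = 800, and adding that to virtual_chapter_low
+ * recovers virtual chapter 5000.
+ *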
+ * For efficiency, when older chapter numbers become invalid, the index does not immediately remove
+ * the invalidated entries. Instead it lazily removes them from a given delta list the next time it
+ * walks that list during normal operation. Because of this, the index size must be increased
+ * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard
+ * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in
+ * the index.
+ */
+
+struct sub_index_parameters {
+       /* The number of bits in address mask */
+       u8 address_bits;
+       /* The number of bits in chapter number */
+       u8 chapter_bits;
+       /* The mean delta */
+       u32 mean_delta;
+       /* The number of delta lists */
+       u64 list_count;
+       /* The number of chapters used */
+       u32 chapter_count;
+       /* The number of bits per chapter */
+       size_t chapter_size_in_bits;
+       /* The number of bytes of delta list memory */
+       size_t memory_size;
+       /* The number of bytes the index should keep free at all times */
+       size_t target_free_bytes;
+};
+
+struct split_config {
+       /* The hook subindex configuration */
+       struct uds_configuration hook_config;
+       struct index_geometry hook_geometry;
+
+       /* The non-hook subindex configuration */
+       struct uds_configuration non_hook_config;
+       struct index_geometry non_hook_geometry;
+};
+
+struct chapter_range {
+       u32 chapter_start;
+       u32 chapter_count;
+};
+
+enum { MAGIC_SIZE = 8 };
+static const char MAGIC_START_5[] = "MI5-0005";
+
+struct sub_index_data {
+       char magic[MAGIC_SIZE]; /* MAGIC_START_5 */
+       u64 volume_nonce;
+       u64 virtual_chapter_low;
+       u64 virtual_chapter_high;
+       u32 first_list;
+       u32 list_count;
+};
+
+static const char MAGIC_START_6[] = "MI6-0001";
+
+struct volume_index_data {
+       char magic[MAGIC_SIZE]; /* MAGIC_START_6 */
+       u32 sparse_sample_rate;
+};
+
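+/*
+ * Each record name yields a fixed set of volume index bytes that is split in two: assuming
+ * address_mask covers the low address_bits bits, those bits form the address within a delta
+ * list, and the remaining high bits, taken modulo list_count, select the delta list. For
+ * illustration, with address_bits of 24 and a value of 0x12345678, the address is 0x345678
+ * and list 0x12 % list_count is searched.
+ */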
+static inline u32 extract_address(const struct volume_sub_index *sub_index,
+                                 const struct uds_record_name *name)
+{
+       return uds_extract_volume_index_bytes(name) & sub_index->address_mask;
+}
+
+static inline u32 extract_dlist_num(const struct volume_sub_index *sub_index,
+                                   const struct uds_record_name *name)
+{
+       u64 bits = uds_extract_volume_index_bytes(name);
+
+       return (bits >> sub_index->address_bits) % sub_index->list_count;
+}
+
+static inline const struct volume_sub_index_zone *
+get_zone_for_record(const struct volume_index_record *record)
+{
+       return &record->sub_index->zones[record->zone_number];
+}
+
+static inline u64 convert_index_to_virtual(const struct volume_index_record *record,
+                                          u32 index_chapter)
+{
+       const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record);
+       u32 rolling_chapter = ((index_chapter - volume_index_zone->virtual_chapter_low) &
+                              record->sub_index->chapter_mask);
+
+       return volume_index_zone->virtual_chapter_low + rolling_chapter;
+}
+
+static inline u32 convert_virtual_to_index(const struct volume_sub_index *sub_index,
+                                          u64 virtual_chapter)
+{
+       return virtual_chapter & sub_index->chapter_mask;
+}
+
+static inline bool is_virtual_chapter_indexed(const struct volume_index_record *record,
+                                             u64 virtual_chapter)
+{
+       const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record);
+
+       return ((virtual_chapter >= volume_index_zone->virtual_chapter_low) &&
+               (virtual_chapter <= volume_index_zone->virtual_chapter_high));
+}
+
+static inline bool has_sparse(const struct volume_index *volume_index)
+{
+       return volume_index->sparse_sample_rate > 0;
+}
+
+bool uds_is_volume_index_sample(const struct volume_index *volume_index,
+                               const struct uds_record_name *name)
+{
+       if (!has_sparse(volume_index))
+               return false;
+
+       return (uds_extract_sampling_bytes(name) % volume_index->sparse_sample_rate) == 0;
+}
+
+static inline const struct volume_sub_index *
+get_volume_sub_index(const struct volume_index *volume_index,
+                    const struct uds_record_name *name)
+{
+       return (uds_is_volume_index_sample(volume_index, name) ?
+               &volume_index->vi_hook :
+               &volume_index->vi_non_hook);
+}
+
+static unsigned int get_volume_sub_index_zone(const struct volume_sub_index *sub_index,
+                                             const struct uds_record_name *name)
+{
+       return extract_dlist_num(sub_index, name) / sub_index->delta_index.lists_per_zone;
+}
+
+unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index,
+                                      const struct uds_record_name *name)
+{
+       return get_volume_sub_index_zone(get_volume_sub_index(volume_index, name), name);
+}
+
+static int compute_volume_sub_index_parameters(const struct uds_configuration *config,
+                                              struct sub_index_parameters *params)
+{
+       enum { DELTA_LIST_SIZE = 256 };
+       u64 entries_in_volume_index, address_span;
+       u32 chapters_in_volume_index, invalid_chapters;
+       u32 rounded_chapters;
+       u64 delta_list_records;
+       u32 address_count;
+       u64 index_size_in_bits;
+       size_t expected_index_size;
+       u64 min_delta_lists = MAX_ZONES * MAX_ZONES;
+       struct index_geometry *geometry = config->geometry;
+       u64 records_per_chapter = geometry->records_per_chapter;
+
+       params->chapter_count = geometry->chapters_per_volume;
+       /*
+        * Make sure that the number of delta list records in the volume index does not change when
+        * the volume is reduced by one chapter. This preserves the mapping from name to volume
+        * index delta list.
+        */
+       rounded_chapters = params->chapter_count;
+       if (uds_is_reduced_index_geometry(geometry))
+               rounded_chapters += 1;
+       delta_list_records = records_per_chapter * rounded_chapters;
+       address_count = config->volume_index_mean_delta * DELTA_LIST_SIZE;
+       params->list_count = max(delta_list_records / DELTA_LIST_SIZE, min_delta_lists);
+       params->address_bits = bits_per(address_count - 1);
+       params->chapter_bits = bits_per(rounded_chapters - 1);
+       if ((u32) params->list_count != params->list_count) {
+               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+                                               "cannot initialize volume index with %llu delta lists",
+                                               (unsigned long long) params->list_count);
+       }
+
+       if (params->address_bits > 31) {
+               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+                                               "cannot initialize volume index with %u address bits",
+                                               params->address_bits);
+       }
+
+       /*
+        * The probability that a given delta list is not touched during the writing of an entire
+        * chapter is:
+        *
+        * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count,
+        *                            records_per_chapter);
+        *
+        * For the standard index sizes, about 78% of the delta lists are not touched, and
+        * therefore contain old index entries that have not been eliminated by the lazy LRU
+        * processing. Then the number of old index entries that accumulate over the entire index,
+        * in terms of full chapters worth of entries, is:
+        *
+        * double invalid_chapters = p_not_touched / (1.0 - p_not_touched);
+        *
+        * For the standard index sizes, the index needs about 3.5 chapters of space for the old
+        * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in
+        * the index.
+        */
+       invalid_chapters = max(rounded_chapters / 256, 2U);
+       chapters_in_volume_index = rounded_chapters + invalid_chapters;
+       entries_in_volume_index = records_per_chapter * chapters_in_volume_index;
+
+       address_span = params->list_count << params->address_bits;
+       params->mean_delta = address_span / entries_in_volume_index;
+
+       /*
+        * Compute the expected size of a full index, then set the total memory to be 6% larger
+        * than that expected size. This number should be large enough that there are not many
+        * rebalances when the index is full.
+        */
+       params->chapter_size_in_bits = uds_compute_delta_index_size(records_per_chapter,
+                                                                   params->mean_delta,
+                                                                   params->chapter_bits);
+       index_size_in_bits = params->chapter_size_in_bits * chapters_in_volume_index;
+       expected_index_size = index_size_in_bits / BITS_PER_BYTE;
+       params->memory_size = expected_index_size * 106 / 100;
+
+       params->target_free_bytes = expected_index_size / 20;
+       return UDS_SUCCESS;
+}
+
+static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index)
+{
+       uds_free(uds_forget(sub_index->flush_chapters));
+       uds_free(uds_forget(sub_index->zones));
+       uds_uninitialize_delta_index(&sub_index->delta_index);
+}
+
+void uds_free_volume_index(struct volume_index *volume_index)
+{
+       if (volume_index == NULL)
+               return;
+
+       if (volume_index->zones != NULL)
+               uds_free(uds_forget(volume_index->zones));
+
+       uninitialize_volume_sub_index(&volume_index->vi_non_hook);
+       uninitialize_volume_sub_index(&volume_index->vi_hook);
+       uds_free(volume_index);
+}
+
+static int compute_volume_sub_index_save_bytes(const struct uds_configuration *config,
+                                              size_t *bytes)
+{
+       struct sub_index_parameters params = { .address_bits = 0 };
+       int result;
+
+       result = compute_volume_sub_index_parameters(config, &params);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       *bytes = (sizeof(struct sub_index_data) + params.list_count * sizeof(u64) +
+                 uds_compute_delta_index_save_bytes(params.list_count,
+                                                    params.memory_size));
+       return UDS_SUCCESS;
+}
+
+/* This function is only useful if the configuration includes sparse chapters. */
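+/*
+ * For illustration: with a sparse_sample_rate of 32, one record name in 32 is a hook, so the
+ * hook geometry indexes records_per_chapter / 32 records for every chapter while the non-hook
+ * geometry indexes the remaining records for only the dense chapters.
+ */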
+static void split_configuration(const struct uds_configuration *config,
+                               struct split_config *split)
+{
+       u64 sample_rate, sample_records;
+       u64 dense_chapters, sparse_chapters;
+
+       /* Start with copies of the base configuration. */
+       split->hook_config = *config;
+       split->hook_geometry = *config->geometry;
+       split->hook_config.geometry = &split->hook_geometry;
+       split->non_hook_config = *config;
+       split->non_hook_geometry = *config->geometry;
+       split->non_hook_config.geometry = &split->non_hook_geometry;
+
+       sample_rate = config->sparse_sample_rate;
+       sparse_chapters = config->geometry->sparse_chapters_per_volume;
+       dense_chapters = config->geometry->chapters_per_volume - sparse_chapters;
+       sample_records = config->geometry->records_per_chapter / sample_rate;
+
+       /* Adjust the number of records indexed for each chapter. */
+       split->hook_geometry.records_per_chapter = sample_records;
+       split->non_hook_geometry.records_per_chapter -= sample_records;
+
+       /* Adjust the number of chapters indexed. */
+       split->hook_geometry.sparse_chapters_per_volume = 0;
+       split->non_hook_geometry.sparse_chapters_per_volume = 0;
+       split->non_hook_geometry.chapters_per_volume = dense_chapters;
+}
+
+static int compute_volume_index_save_bytes(const struct uds_configuration *config,
+                                          size_t *bytes)
+{
+       size_t hook_bytes, non_hook_bytes;
+       struct split_config split;
+       int result;
+
+       if (!uds_is_sparse_index_geometry(config->geometry))
+               return compute_volume_sub_index_save_bytes(config, bytes);
+
+       split_configuration(config, &split);
+       result = compute_volume_sub_index_save_bytes(&split.hook_config, &hook_bytes);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = compute_volume_sub_index_save_bytes(&split.non_hook_config,
+                                                    &non_hook_bytes);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       *bytes = sizeof(struct volume_index_data) + hook_bytes + non_hook_bytes;
+       return UDS_SUCCESS;
+}
+
+int uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
+                                        size_t block_size, u64 *block_count)
+{
+       size_t bytes;
+       int result;
+
+       result = compute_volume_index_save_bytes(config, &bytes);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       bytes += sizeof(struct delta_list_save_info);
+       *block_count = DIV_ROUND_UP(bytes, block_size) + MAX_ZONES;
+       return UDS_SUCCESS;
+}
+
+/* Flush invalid entries while walking the delta list. */
+static inline int flush_invalid_entries(struct volume_index_record *record,
+                                       struct chapter_range *flush_range,
+                                       u32 *next_chapter_to_invalidate)
+{
+       int result;
+
+       result = uds_next_delta_index_entry(&record->delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       while (!record->delta_entry.at_end) {
+               u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);
+               u32 relative_chapter = ((index_chapter - flush_range->chapter_start) &
+                                       record->sub_index->chapter_mask);
+
+               if (likely(relative_chapter >= flush_range->chapter_count)) {
+                       if (relative_chapter < *next_chapter_to_invalidate)
+                               *next_chapter_to_invalidate = relative_chapter;
+                       break;
+               }
+
+               result = uds_remove_delta_index_entry(&record->delta_entry);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       return UDS_SUCCESS;
+}
+
+/* Find the matching record, or the list offset where the record would go. */
+static int get_volume_index_entry(struct volume_index_record *record, u32 list_number,
+                                 u32 key, struct chapter_range *flush_range)
+{
+       struct volume_index_record other_record;
+       const struct volume_sub_index *sub_index = record->sub_index;
+       u32 next_chapter_to_invalidate = sub_index->chapter_mask;
+       int result;
+
+       result = uds_start_delta_index_search(&sub_index->delta_index, list_number, 0,
+                                             &record->delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       do {
+               result = flush_invalid_entries(record, flush_range,
+                                              &next_chapter_to_invalidate);
+               if (result != UDS_SUCCESS)
+                       return result;
+       } while (!record->delta_entry.at_end && (key > record->delta_entry.key));
+
+       result = uds_remember_delta_index_offset(&record->delta_entry);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       /* Check any collision records for a more precise match. */
+       other_record = *record;
+       if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) {
+               for (;;) {
+                       u8 collision_name[UDS_RECORD_NAME_SIZE];
+
+                       result = flush_invalid_entries(&other_record, flush_range,
+                                                      &next_chapter_to_invalidate);
+                       if (result != UDS_SUCCESS)
+                               return result;
+
+                       if (other_record.delta_entry.at_end ||
+                           !other_record.delta_entry.is_collision)
+                               break;
+
+                       result = uds_get_delta_entry_collision(&other_record.delta_entry,
+                                                              collision_name);
+                       if (result != UDS_SUCCESS)
+                               return result;
+
+                       if (memcmp(collision_name, record->name, UDS_RECORD_NAME_SIZE) == 0) {
+                               *record = other_record;
+                               break;
+                       }
+               }
+       }
+       while (!other_record.delta_entry.at_end) {
+               result = flush_invalid_entries(&other_record, flush_range,
+                                              &next_chapter_to_invalidate);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+       next_chapter_to_invalidate += flush_range->chapter_start;
+       next_chapter_to_invalidate &= sub_index->chapter_mask;
+       flush_range->chapter_start = next_chapter_to_invalidate;
+       flush_range->chapter_count = 0;
+       return UDS_SUCCESS;
+}
+
+static int get_volume_sub_index_record(struct volume_sub_index *sub_index,
+                                      const struct uds_record_name *name,
+                                      struct volume_index_record *record)
+{
+       int result;
+       const struct volume_sub_index_zone *volume_index_zone;
+       u32 address = extract_address(sub_index, name);
+       u32 delta_list_number = extract_dlist_num(sub_index, name);
+       u64 flush_chapter = sub_index->flush_chapters[delta_list_number];
+
+       record->sub_index = sub_index;
+       record->mutex = NULL;
+       record->name = name;
+       record->zone_number = delta_list_number / sub_index->delta_index.lists_per_zone;
+       volume_index_zone = get_zone_for_record(record);
+
+       if (flush_chapter < volume_index_zone->virtual_chapter_low) {
+               struct chapter_range range;
+               u64 flush_count = volume_index_zone->virtual_chapter_low - flush_chapter;
+
+               range.chapter_start = convert_virtual_to_index(sub_index, flush_chapter);
+               range.chapter_count = (flush_count > sub_index->chapter_mask ?
+                                      sub_index->chapter_mask + 1 :
+                                      flush_count);
+               result = get_volume_index_entry(record, delta_list_number, address,
+                                               &range);
+               flush_chapter = convert_index_to_virtual(record, range.chapter_start);
+               if (flush_chapter > volume_index_zone->virtual_chapter_high)
+                       flush_chapter = volume_index_zone->virtual_chapter_high;
+               sub_index->flush_chapters[delta_list_number] = flush_chapter;
+       } else {
+               result = uds_get_delta_index_entry(&sub_index->delta_index,
+                                                  delta_list_number, address,
+                                                  name->name, &record->delta_entry);
+       }
+
+       if (result != UDS_SUCCESS)
+               return result;
+
+       record->is_found =
+               (!record->delta_entry.at_end && (record->delta_entry.key == address));
+       if (record->is_found) {
+               u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);
+
+               record->virtual_chapter = convert_index_to_virtual(record, index_chapter);
+       }
+
+       record->is_collision = record->delta_entry.is_collision;
+       return UDS_SUCCESS;
+}
+
+int uds_get_volume_index_record(struct volume_index *volume_index,
+                               const struct uds_record_name *name,
+                               struct volume_index_record *record)
+{
+       int result;
+
+       if (uds_is_volume_index_sample(volume_index, name)) {
+               /*
+                * Other threads cannot be allowed to call uds_lookup_volume_index_name() while
+                * this thread is finding the volume index record. Due to the lazy LRU flushing of
+                * the volume index, uds_get_volume_index_record() is not a read-only operation.
+                */
+               unsigned int zone =
+                       get_volume_sub_index_zone(&volume_index->vi_hook, name);
+               struct mutex *mutex = &volume_index->zones[zone].hook_mutex;
+
+               mutex_lock(mutex);
+               result = get_volume_sub_index_record(&volume_index->vi_hook, name,
+                                                    record);
+               mutex_unlock(mutex);
+               /* Remember the mutex so that other operations on the index record can use it. */
+               record->mutex = mutex;
+       } else {
+               result = get_volume_sub_index_record(&volume_index->vi_non_hook, name,
+                                                    record);
+       }
+
+       return result;
+}
+
+int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_chapter)
+{
+       int result;
+       u32 address;
+       const struct volume_sub_index *sub_index = record->sub_index;
+
+       if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
+               u64 low = get_zone_for_record(record)->virtual_chapter_low;
+               u64 high = get_zone_for_record(record)->virtual_chapter_high;
+
+               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+                                               "cannot put record into chapter number %llu that is out of the valid range %llu to %llu",
+                                               (unsigned long long) virtual_chapter,
+                                               (unsigned long long) low,
+                                               (unsigned long long) high);
+       }
+       address = extract_address(sub_index, record->name);
+       if (unlikely(record->mutex != NULL))
+               mutex_lock(record->mutex);
+       result = uds_put_delta_index_entry(&record->delta_entry, address,
+                                          convert_virtual_to_index(sub_index,
+                                                                   virtual_chapter),
+                                          record->is_found ? record->name->name : NULL);
+       if (unlikely(record->mutex != NULL))
+               mutex_unlock(record->mutex);
+       switch (result) {
+       case UDS_SUCCESS:
+               record->virtual_chapter = virtual_chapter;
+               record->is_collision = record->delta_entry.is_collision;
+               record->is_found = true;
+               break;
+       case UDS_OVERFLOW:
+               uds_log_ratelimit(uds_log_warning_strerror, UDS_OVERFLOW,
+                                 "Volume index entry dropped due to overflow condition");
+               uds_log_delta_index_entry(&record->delta_entry);
+               break;
+       default:
+               break;
+       }
+
+       return result;
+}
+
+int uds_remove_volume_index_record(struct volume_index_record *record)
+{
+       int result;
+
+       if (!record->is_found)
+               return uds_log_warning_strerror(UDS_BAD_STATE,
+                                               "illegal operation on new record");
+
+       /* Mark the record so that it cannot be used again */
+       record->is_found = false;
+       if (unlikely(record->mutex != NULL))
+               mutex_lock(record->mutex);
+       result = uds_remove_delta_index_entry(&record->delta_entry);
+       if (unlikely(record->mutex != NULL))
+               mutex_unlock(record->mutex);
+       return result;
+}
+
+static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_index,
+                                                  unsigned int zone_number,
+                                                  u64 virtual_chapter)
+{
+       u64 used_bits = 0;
+       struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
+       struct delta_zone *delta_zone;
+       u32 i;
+
+       zone->virtual_chapter_low = (virtual_chapter >= sub_index->chapter_count ?
+                                    virtual_chapter - sub_index->chapter_count + 1 :
+                                    0);
+       zone->virtual_chapter_high = virtual_chapter;
+
+       /* Check to see if the new zone data is too large. */
+       delta_zone = &sub_index->delta_index.delta_zones[zone_number];
+       for (i = 1; i <= delta_zone->list_count; i++)
+               used_bits += delta_zone->delta_lists[i].size;
+
+       if (used_bits > sub_index->max_zone_bits) {
+               /* Expire enough chapters to free the desired space. */
+               u64 expire_count =
+                       1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits;
+
+               if (expire_count == 1) {
+                       uds_log_ratelimit(uds_log_info,
+                                         "zone %u:  At chapter %llu, expiring chapter %llu early",
+                                         zone_number,
+                                         (unsigned long long) virtual_chapter,
+                                         (unsigned long long) zone->virtual_chapter_low);
+                       zone->early_flushes++;
+                       zone->virtual_chapter_low++;
+               } else {
+                       u64 first_expired = zone->virtual_chapter_low;
+
+                       if (first_expired + expire_count < zone->virtual_chapter_high) {
+                               zone->early_flushes += expire_count;
+                               zone->virtual_chapter_low += expire_count;
+                       } else {
+                               zone->early_flushes +=
+                                       zone->virtual_chapter_high - zone->virtual_chapter_low;
+                               zone->virtual_chapter_low = zone->virtual_chapter_high;
+                       }
+                       uds_log_ratelimit(uds_log_info,
+                                         "zone %u:  At chapter %llu, expiring chapters %llu to %llu early",
+                                         zone_number,
+                                         (unsigned long long) virtual_chapter,
+                                         (unsigned long long) first_expired,
+                                         (unsigned long long) zone->virtual_chapter_low - 1);
+               }
+       }
+}
+
+void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
+                                           unsigned int zone_number,
+                                           u64 virtual_chapter)
+{
+       struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex;
+
+       set_volume_sub_index_zone_open_chapter(&volume_index->vi_non_hook, zone_number,
+                                              virtual_chapter);
+
+       /*
+        * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open
+        * chapter number is changing.
+        */
+       if (has_sparse(volume_index)) {
+               mutex_lock(mutex);
+               set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook,
+                                                      zone_number, virtual_chapter);
+               mutex_unlock(mutex);
+       }
+}
+
+/*
+ * Set the newest open chapter number for the index, while also advancing the oldest valid chapter
+ * number.
+ */
+void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
+                                      u64 virtual_chapter)
+{
+       unsigned int zone;
+
+       for (zone = 0; zone < volume_index->zone_count; zone++)
+               uds_set_volume_index_zone_open_chapter(volume_index, zone, virtual_chapter);
+}
+
+int uds_set_volume_index_record_chapter(struct volume_index_record *record,
+                                       u64 virtual_chapter)
+{
+       const struct volume_sub_index *sub_index = record->sub_index;
+       int result;
+
+       if (!record->is_found)
+               return uds_log_warning_strerror(UDS_BAD_STATE,
+                                               "illegal operation on new record");
+
+       if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
+               u64 low = get_zone_for_record(record)->virtual_chapter_low;
+               u64 high = get_zone_for_record(record)->virtual_chapter_high;
+
+               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
+                                               "cannot set chapter number %llu that is out of the valid range %llu to %llu",
+                                               (unsigned long long) virtual_chapter,
+                                               (unsigned long long) low,
+                                               (unsigned long long) high);
+       }
+
+       if (unlikely(record->mutex != NULL))
+               mutex_lock(record->mutex);
+       result = uds_set_delta_entry_value(&record->delta_entry,
+                                          convert_virtual_to_index(sub_index,
+                                                                   virtual_chapter));
+       if (unlikely(record->mutex != NULL))
+               mutex_unlock(record->mutex);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       record->virtual_chapter = virtual_chapter;
+       return UDS_SUCCESS;
+}
+
+static u64 lookup_volume_sub_index_name(const struct volume_sub_index *sub_index,
+                                       const struct uds_record_name *name)
+{
+       int result;
+       u32 address = extract_address(sub_index, name);
+       u32 delta_list_number = extract_dlist_num(sub_index, name);
+       unsigned int zone_number = get_volume_sub_index_zone(sub_index, name);
+       const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
+       u64 virtual_chapter;
+       u32 index_chapter;
+       u32 rolling_chapter;
+       struct delta_index_entry delta_entry;
+
+       result = uds_get_delta_index_entry(&sub_index->delta_index, delta_list_number,
+                                          address, name->name, &delta_entry);
+       if (result != UDS_SUCCESS)
+               return NO_CHAPTER;
+
+       if (delta_entry.at_end || (delta_entry.key != address))
+               return NO_CHAPTER;
+
+       index_chapter = uds_get_delta_entry_value(&delta_entry);
+       rolling_chapter = (index_chapter - zone->virtual_chapter_low) & sub_index->chapter_mask;
+
+       virtual_chapter = zone->virtual_chapter_low + rolling_chapter;
+       if (virtual_chapter > zone->virtual_chapter_high)
+               return NO_CHAPTER;
+
+       return virtual_chapter;
+}
+
+/* Do a read-only lookup of the record name for sparse cache management. */
+u64 uds_lookup_volume_index_name(const struct volume_index *volume_index,
+                                const struct uds_record_name *name)
+{
+       unsigned int zone_number = uds_get_volume_index_zone(volume_index, name);
+       struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex;
+       u64 virtual_chapter;
+
+       if (!uds_is_volume_index_sample(volume_index, name))
+               return NO_CHAPTER;
+
+       mutex_lock(mutex);
+       virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name);
+       mutex_unlock(mutex);
+
+       return virtual_chapter;
+}
+
+static void abort_restoring_volume_sub_index(struct volume_sub_index *sub_index)
+{
+       uds_reset_delta_index(&sub_index->delta_index);
+}
+
+static void abort_restoring_volume_index(struct volume_index *volume_index)
+{
+       abort_restoring_volume_sub_index(&volume_index->vi_non_hook);
+       if (has_sparse(volume_index))
+               abort_restoring_volume_sub_index(&volume_index->vi_hook);
+}
+
+static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
+                                           struct buffered_reader **readers,
+                                           unsigned int reader_count)
+{
+       unsigned int z;
+       int result;
+       u64 virtual_chapter_low = 0, virtual_chapter_high = 0;
+       unsigned int i;
+
+       for (i = 0; i < reader_count; i++) {
+               struct sub_index_data header;
+               u8 buffer[sizeof(struct sub_index_data)];
+               size_t offset = 0;
+               u32 j;
+
+               result = uds_read_from_buffered_reader(readers[i], buffer,
+                                                      sizeof(buffer));
+               if (result != UDS_SUCCESS) {
+                       return uds_log_warning_strerror(result,
+                                                       "failed to read volume index header");
+               }
+
+               memcpy(&header.magic, buffer, MAGIC_SIZE);
+               offset += MAGIC_SIZE;
+               decode_u64_le(buffer, &offset, &header.volume_nonce);
+               decode_u64_le(buffer, &offset, &header.virtual_chapter_low);
+               decode_u64_le(buffer, &offset, &header.virtual_chapter_high);
+               decode_u32_le(buffer, &offset, &header.first_list);
+               decode_u32_le(buffer, &offset, &header.list_count);
+
+               result = ASSERT(offset == sizeof(buffer),
+                               "%zu bytes decoded of %zu expected", offset,
+                               sizeof(buffer));
+               if (result != UDS_SUCCESS)
+                       result = UDS_CORRUPT_DATA;
+
+               if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "volume index file had bad magic number");
+               }
+
+               if (sub_index->volume_nonce == 0) {
+                       sub_index->volume_nonce = header.volume_nonce;
+               } else if (header.volume_nonce != sub_index->volume_nonce) {
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "volume index volume nonce incorrect");
+               }
+
+               if (i == 0) {
+                       virtual_chapter_low = header.virtual_chapter_low;
+                       virtual_chapter_high = header.virtual_chapter_high;
+               } else if (virtual_chapter_high != header.virtual_chapter_high) {
+                       u64 low = header.virtual_chapter_low;
+                       u64 high = header.virtual_chapter_high;
+
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]",
+                                                       (unsigned long long) virtual_chapter_low,
+                                                       (unsigned long long) virtual_chapter_high,
+                                                       i, (unsigned long long) low,
+                                                       (unsigned long long) high);
+               } else if (virtual_chapter_low < header.virtual_chapter_low) {
+                       virtual_chapter_low = header.virtual_chapter_low;
+               }
+
+               for (j = 0; j < header.list_count; j++) {
+                       u8 decoded[sizeof(u64)];
+
+                       result = uds_read_from_buffered_reader(readers[i], decoded,
+                                                              sizeof(u64));
+                       if (result != UDS_SUCCESS) {
+                               return uds_log_warning_strerror(result,
+                                                               "failed to read volume index flush ranges");
+                       }
+
+                       sub_index->flush_chapters[header.first_list + j] =
+                               get_unaligned_le64(decoded);
+               }
+       }
+
+       for (z = 0; z < sub_index->zone_count; z++) {
+               memset(&sub_index->zones[z], 0, sizeof(struct volume_sub_index_zone));
+               sub_index->zones[z].virtual_chapter_low = virtual_chapter_low;
+               sub_index->zones[z].virtual_chapter_high = virtual_chapter_high;
+       }
+
+       result = uds_start_restoring_delta_index(&sub_index->delta_index, readers,
+                                                reader_count);
+       if (result != UDS_SUCCESS)
+               return uds_log_warning_strerror(result, "restoring delta index failed");
+
+       return UDS_SUCCESS;
+}
+
+static int start_restoring_volume_index(struct volume_index *volume_index,
+                                       struct buffered_reader **buffered_readers,
+                                       unsigned int reader_count)
+{
+       unsigned int i;
+       int result;
+
+       if (!has_sparse(volume_index)) {
+               return start_restoring_volume_sub_index(&volume_index->vi_non_hook,
+                                                       buffered_readers, reader_count);
+       }
+
+       for (i = 0; i < reader_count; i++) {
+               struct volume_index_data header;
+               u8 buffer[sizeof(struct volume_index_data)];
+               size_t offset = 0;
+
+               result = uds_read_from_buffered_reader(buffered_readers[i], buffer,
+                                                      sizeof(buffer));
+               if (result != UDS_SUCCESS) {
+                       return uds_log_warning_strerror(result,
+                                                       "failed to read volume index header");
+               }
+
+               memcpy(&header.magic, buffer, MAGIC_SIZE);
+               offset += MAGIC_SIZE;
+               decode_u32_le(buffer, &offset, &header.sparse_sample_rate);
+
+               result = ASSERT(offset == sizeof(buffer),
+                               "%zu bytes decoded of %zu expected", offset,
+                               sizeof(buffer));
+               if (result != UDS_SUCCESS)
+                       result = UDS_CORRUPT_DATA;
+
+               if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
+                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                       "volume index file had bad magic number");
+
+               if (i == 0) {
+                       volume_index->sparse_sample_rate = header.sparse_sample_rate;
+               } else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) {
+                       uds_log_warning_strerror(UDS_CORRUPT_DATA,
+                                                "Inconsistent sparse sample rate in delta index zone files: %u vs. %u",
+                                                volume_index->sparse_sample_rate,
+                                                header.sparse_sample_rate);
+                       return UDS_CORRUPT_DATA;
+               }
+       }
+
+       result = start_restoring_volume_sub_index(&volume_index->vi_non_hook,
+                                                 buffered_readers, reader_count);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return start_restoring_volume_sub_index(&volume_index->vi_hook, buffered_readers,
+                                               reader_count);
+}
+
+static int finish_restoring_volume_sub_index(struct volume_sub_index *sub_index,
+                                            struct buffered_reader **buffered_readers,
+                                            unsigned int reader_count)
+{
+       return uds_finish_restoring_delta_index(&sub_index->delta_index,
+                                               buffered_readers, reader_count);
+}
+
+static int finish_restoring_volume_index(struct volume_index *volume_index,
+                                        struct buffered_reader **buffered_readers,
+                                        unsigned int reader_count)
+{
+       int result;
+
+       result = finish_restoring_volume_sub_index(&volume_index->vi_non_hook,
+                                                  buffered_readers, reader_count);
+       if ((result == UDS_SUCCESS) && has_sparse(volume_index)) {
+               result = finish_restoring_volume_sub_index(&volume_index->vi_hook,
+                                                          buffered_readers,
+                                                          reader_count);
+       }
+
+       return result;
+}
+
+int uds_load_volume_index(struct volume_index *volume_index,
+                         struct buffered_reader **readers, unsigned int reader_count)
+{
+       int result;
+
+       /* Start by reading the header section of the stream. */
+       result = start_restoring_volume_index(volume_index, readers, reader_count);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = finish_restoring_volume_index(volume_index, readers, reader_count);
+       if (result != UDS_SUCCESS) {
+               abort_restoring_volume_index(volume_index);
+               return result;
+       }
+
+       /* Check the final guard lists to make sure there is no extra data. */
+       result = uds_check_guard_delta_lists(readers, reader_count);
+       if (result != UDS_SUCCESS)
+               abort_restoring_volume_index(volume_index);
+
+       return result;
+}
+
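+/*
+ * The save format written below mirrors what start_restoring_volume_sub_index()
+ * reads: a header consisting of the MAGIC_START_5 bytes, the volume nonce, the
+ * low and high virtual chapter numbers, the first delta list number, and the
+ * delta list count (all little-endian), followed by one u64 flush chapter per
+ * delta list in the zone, and then the saved delta index data for that zone.
+ */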
+static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index,
+                                        unsigned int zone_number,
+                                        struct buffered_writer *buffered_writer)
+{
+       int result;
+       struct volume_sub_index_zone *volume_index_zone = &sub_index->zones[zone_number];
+       u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list;
+       u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count;
+       u8 buffer[sizeof(struct sub_index_data)];
+       size_t offset = 0;
+       u32 i;
+
+       memcpy(buffer, MAGIC_START_5, MAGIC_SIZE);
+       offset += MAGIC_SIZE;
+       encode_u64_le(buffer, &offset, sub_index->volume_nonce);
+       encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_low);
+       encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_high);
+       encode_u32_le(buffer, &offset, first_list);
+       encode_u32_le(buffer, &offset, list_count);
+
+       result = ASSERT(offset == sizeof(struct sub_index_data),
+                        "%zu bytes of config written, of %zu expected", offset,
+                        sizeof(struct sub_index_data));
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
+       if (result != UDS_SUCCESS)
+               return uds_log_warning_strerror(result,
+                                               "failed to write volume index header");
+
+       for (i = 0; i < list_count; i++) {
+               u8 encoded[sizeof(u64)];
+
+               put_unaligned_le64(sub_index->flush_chapters[first_list + i], &encoded);
+               result = uds_write_to_buffered_writer(buffered_writer, encoded,
+                                                     sizeof(u64));
+               if (result != UDS_SUCCESS) {
+                       return uds_log_warning_strerror(result,
+                                                       "failed to write volume index flush ranges");
+               }
+       }
+
+       return uds_start_saving_delta_index(&sub_index->delta_index, zone_number,
+                                           buffered_writer);
+}
+
+static int start_saving_volume_index(const struct volume_index *volume_index,
+                                    unsigned int zone_number,
+                                    struct buffered_writer *writer)
+{
+       u8 buffer[sizeof(struct volume_index_data)];
+       size_t offset = 0;
+       int result;
+
+       if (!has_sparse(volume_index)) {
+               return start_saving_volume_sub_index(&volume_index->vi_non_hook,
+                                                    zone_number, writer);
+       }
+
+       memcpy(buffer, MAGIC_START_6, MAGIC_SIZE);
+       offset += MAGIC_SIZE;
+       encode_u32_le(buffer, &offset, volume_index->sparse_sample_rate);
+       result = ASSERT(offset == sizeof(struct volume_index_data),
+                       "%zu bytes of header written, of %zu expected", offset,
+                       sizeof(struct volume_index_data));
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_write_to_buffered_writer(writer, buffer, offset);
+       if (result != UDS_SUCCESS) {
+               uds_log_warning_strerror(result, "failed to write volume index header");
+               return result;
+       }
+
+       result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number,
+                                              writer);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return start_saving_volume_sub_index(&volume_index->vi_hook, zone_number,
+                                            writer);
+}
+
+static int finish_saving_volume_sub_index(const struct volume_sub_index *sub_index,
+                                         unsigned int zone_number)
+{
+       return uds_finish_saving_delta_index(&sub_index->delta_index, zone_number);
+}
+
+static int finish_saving_volume_index(const struct volume_index *volume_index,
+                                     unsigned int zone_number)
+{
+       int result;
+
+       result = finish_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number);
+       if ((result == UDS_SUCCESS) && has_sparse(volume_index))
+               result = finish_saving_volume_sub_index(&volume_index->vi_hook, zone_number);
+       return result;
+}
+
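+/* Save the volume index, using one buffered writer per zone. */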
+int uds_save_volume_index(struct volume_index *volume_index,
+                         struct buffered_writer **writers, unsigned int writer_count)
+{
+       int result = UDS_SUCCESS;
+       unsigned int zone;
+
+       for (zone = 0; zone < writer_count; zone++) {
+               result = start_saving_volume_index(volume_index, zone, writers[zone]);
+               if (result != UDS_SUCCESS)
+                       break;
+
+               result = finish_saving_volume_index(volume_index, zone);
+               if (result != UDS_SUCCESS)
+                       break;
+
+               result = uds_write_guard_delta_list(writers[zone]);
+               if (result != UDS_SUCCESS)
+                       break;
+
+               result = uds_flush_buffered_writer(writers[zone]);
+               if (result != UDS_SUCCESS)
+                       break;
+       }
+
+       return result;
+}
+
+static void get_volume_sub_index_stats(const struct volume_sub_index *sub_index,
+                                      struct volume_index_stats *stats)
+{
+       struct delta_index_stats dis;
+       unsigned int z;
+
+       uds_get_delta_index_stats(&sub_index->delta_index, &dis);
+       stats->rebalance_time = dis.rebalance_time;
+       stats->rebalance_count = dis.rebalance_count;
+       stats->record_count = dis.record_count;
+       stats->collision_count = dis.collision_count;
+       stats->discard_count = dis.discard_count;
+       stats->overflow_count = dis.overflow_count;
+       stats->delta_lists = dis.list_count;
+       stats->early_flushes = 0;
+       for (z = 0; z < sub_index->zone_count; z++)
+               stats->early_flushes += sub_index->zones[z].early_flushes;
+}
+
+void uds_get_volume_index_stats(const struct volume_index *volume_index,
+                               struct volume_index_stats *stats)
+{
+       struct volume_index_stats sparse_stats;
+
+       get_volume_sub_index_stats(&volume_index->vi_non_hook, stats);
+       if (!has_sparse(volume_index))
+               return;
+
+       get_volume_sub_index_stats(&volume_index->vi_hook, &sparse_stats);
+       stats->rebalance_time += sparse_stats.rebalance_time;
+       stats->rebalance_count += sparse_stats.rebalance_count;
+       stats->record_count += sparse_stats.record_count;
+       stats->collision_count += sparse_stats.collision_count;
+       stats->discard_count += sparse_stats.discard_count;
+       stats->overflow_count += sparse_stats.overflow_count;
+       stats->delta_lists += sparse_stats.delta_lists;
+       stats->early_flushes += sparse_stats.early_flushes;
+}
+
+static int initialize_volume_sub_index(const struct uds_configuration *config,
+                                      u64 volume_nonce, u8 tag,
+                                      struct volume_sub_index *sub_index)
+{
+       struct sub_index_parameters params = { .address_bits = 0 };
+       unsigned int zone_count = config->zone_count;
+       u64 available_bytes = 0;
+       unsigned int z;
+       int result;
+
+       result = compute_volume_sub_index_parameters(config, &params);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       sub_index->address_bits = params.address_bits;
+       sub_index->address_mask = (1u << params.address_bits) - 1;
+       sub_index->chapter_bits = params.chapter_bits;
+       sub_index->chapter_mask = (1u << params.chapter_bits) - 1;
+       sub_index->chapter_count = params.chapter_count;
+       sub_index->list_count = params.list_count;
+       sub_index->zone_count = zone_count;
+       sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count;
+       sub_index->volume_nonce = volume_nonce;
+
+       result = uds_initialize_delta_index(&sub_index->delta_index, zone_count,
+                                           params.list_count, params.mean_delta,
+                                           params.chapter_bits, params.memory_size,
+                                           tag);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       for (z = 0; z < sub_index->delta_index.zone_count; z++)
+               available_bytes += sub_index->delta_index.delta_zones[z].size;
+       available_bytes -= params.target_free_bytes;
+       sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count;
+       sub_index->memory_size = (sub_index->delta_index.memory_size +
+                                 sizeof(struct volume_sub_index) +
+                                 (params.list_count * sizeof(u64)) +
+                                 (zone_count * sizeof(struct volume_sub_index_zone)));
+
+       /* The following arrays are initialized to all zeros. */
+       result = uds_allocate(params.list_count, u64, "first chapter to flush",
+                             &sub_index->flush_chapters);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       return uds_allocate(zone_count, struct volume_sub_index_zone,
+                           "volume index zones", &sub_index->zones);
+}
+
+int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce,
+                         struct volume_index **volume_index_ptr)
+{
+       struct split_config split;
+       unsigned int zone;
+       struct volume_index *volume_index;
+       int result;
+
+       result = uds_allocate(1, struct volume_index, "volume index", &volume_index);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       volume_index->zone_count = config->zone_count;
+
+       if (!uds_is_sparse_index_geometry(config->geometry)) {
+               result = initialize_volume_sub_index(config, volume_nonce, 'm',
+                                                    &volume_index->vi_non_hook);
+               if (result != UDS_SUCCESS) {
+                       uds_free_volume_index(volume_index);
+                       return result;
+               }
+
+               volume_index->memory_size = volume_index->vi_non_hook.memory_size;
+               *volume_index_ptr = volume_index;
+               return UDS_SUCCESS;
+       }
+
+       volume_index->sparse_sample_rate = config->sparse_sample_rate;
+
+       result = uds_allocate(config->zone_count, struct volume_index_zone,
+                             "volume index zones", &volume_index->zones);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume_index(volume_index);
+               return result;
+       }
+
+       for (zone = 0; zone < config->zone_count; zone++)
+               mutex_init(&volume_index->zones[zone].hook_mutex);
+
+       split_configuration(config, &split);
+       result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd',
+                                            &volume_index->vi_non_hook);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume_index(volume_index);
+               return uds_log_error_strerror(result,
+                                             "Error creating non hook volume index");
+       }
+
+       result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's',
+                                            &volume_index->vi_hook);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume_index(volume_index);
+               return uds_log_error_strerror(result,
+                                             "Error creating hook volume index");
+       }
+
+       volume_index->memory_size =
+               volume_index->vi_non_hook.memory_size + volume_index->vi_hook.memory_size;
+       *volume_index_ptr = volume_index;
+       return UDS_SUCCESS;
+}
diff --git a/drivers/md/dm-vdo/indexer/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h
new file mode 100644 (file)
index 0000000..583998c
--- /dev/null
@@ -0,0 +1,193 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_VOLUME_INDEX_H
+#define UDS_VOLUME_INDEX_H
+
+#include <linux/limits.h>
+
+#include "thread-utils.h"
+
+#include "config.h"
+#include "delta-index.h"
+#include "indexer.h"
+
+/*
+ * The volume index is the primary top-level index for UDS. It contains records which map a record
+ * name to the chapter where a record with that name is stored. This mapping can definitively say
+ * when no record exists. However, because we only use a subset of the name for this index, it
+ * cannot definitively say that a record for the entry does exist. It can only say that if a record
+ * exists, it will be in a particular chapter. The request can then be dispatched to that chapter
+ * for further processing.
+ *
+ * If the volume_index_record does not actually match the record name, the index can store a more
+ * specific collision record to disambiguate the new entry from the existing one. Index entries are
+ * managed with volume_index_record structures.
+ */
+
+#define NO_CHAPTER U64_MAX
+
+struct volume_index_stats {
+       /* Nanoseconds spent rebalancing */
+       ktime_t rebalance_time;
+       /* Number of memory rebalances */
+       u32 rebalance_count;
+       /* The number of records in the index */
+       u64 record_count;
+       /* The number of collision records */
+       u64 collision_count;
+       /* The number of records removed */
+       u64 discard_count;
+       /* The number of UDS_OVERFLOWs detected */
+       u64 overflow_count;
+       /* The number of delta lists */
+       u32 delta_lists;
+       /* Number of early flushes */
+       u64 early_flushes;
+};
+
+struct volume_sub_index_zone {
+       u64 virtual_chapter_low;
+       u64 virtual_chapter_high;
+       u64 early_flushes;
+} __aligned(L1_CACHE_BYTES);
+
+struct volume_sub_index {
+       /* The delta index */
+       struct delta_index delta_index;
+       /* The first chapter to be flushed in each zone */
+       u64 *flush_chapters;
+       /* The zones */
+       struct volume_sub_index_zone *zones;
+       /* The volume nonce */
+       u64 volume_nonce;
+       /* Expected size of a chapter (per zone) */
+       u64 chapter_zone_bits;
+       /* Maximum size of the index (per zone) */
+       u64 max_zone_bits;
+       /* The number of bits in address mask */
+       u8 address_bits;
+       /* Mask to get address within delta list */
+       u32 address_mask;
+       /* The number of bits in chapter number */
+       u8 chapter_bits;
+       /* The largest storable chapter number */
+       u32 chapter_mask;
+       /* The number of chapters used */
+       u32 chapter_count;
+       /* The number of delta lists */
+       u32 list_count;
+       /* The number of zones */
+       unsigned int zone_count;
+       /* The amount of memory allocated */
+       u64 memory_size;
+};
+
+struct volume_index_zone {
+       /* Protects the sampled index in this zone */
+       struct mutex hook_mutex;
+} __aligned(L1_CACHE_BYTES);
+
+struct volume_index {
+       u32 sparse_sample_rate;
+       unsigned int zone_count;
+       u64 memory_size;
+       struct volume_sub_index vi_non_hook;
+       struct volume_sub_index vi_hook;
+       struct volume_index_zone *zones;
+};
+
+/*
+ * The volume_index_record structure is used to facilitate processing of a record name. A client
+ * first calls uds_get_volume_index_record() to find the volume index record for a record name. The
+ * fields of the record can then be examined to determine the state of the record.
+ *
+ * If is_found is false, then the index did not find an entry for the record name. Calling
+ * uds_put_volume_index_record() will insert a new entry for that name at the proper place.
+ *
+ * If is_found is true, then we did find an entry for the record name, and the virtual_chapter and
+ * is_collision fields reflect the entry found. Subsequently, a call to
+ * uds_remove_volume_index_record() will remove the entry, a call to
+ * uds_set_volume_index_record_chapter() will update the existing entry, and a call to
+ * uds_put_volume_index_record() will insert a new collision record after the existing entry.
+ */
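+
+/*
+ * A minimal usage sketch of the sequence above (illustrative only; error
+ * handling is omitted, and "name" and "chapter" stand in for values supplied
+ * by the caller):
+ *
+ *     struct volume_index_record record;
+ *
+ *     uds_get_volume_index_record(volume_index, name, &record);
+ *     if (!record.is_found)
+ *             uds_put_volume_index_record(&record, chapter);
+ *     else if (record.virtual_chapter != chapter)
+ *             uds_set_volume_index_record_chapter(&record, chapter);
+ */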
+struct volume_index_record {
+       /* Public fields */
+
+       /* Chapter where the record info is found */
+       u64 virtual_chapter;
+       /* This record is a collision */
+       bool is_collision;
+       /* This record is the requested record */
+       bool is_found;
+
+       /* Private fields */
+
+       /* Zone that contains this name */
+       unsigned int zone_number;
+       /* The volume index */
+       struct volume_sub_index *sub_index;
+       /* Mutex for accessing this delta index entry in the hook index */
+       struct mutex *mutex;
+       /* The record name to which this record refers */
+       const struct uds_record_name *name;
+       /* The delta index entry for this record */
+       struct delta_index_entry delta_entry;
+};
+
+int __must_check uds_make_volume_index(const struct uds_configuration *config,
+                                      u64 volume_nonce,
+                                      struct volume_index **volume_index);
+
+void uds_free_volume_index(struct volume_index *volume_index);
+
+int __must_check uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
+                                                     size_t block_size,
+                                                     u64 *block_count);
+
+unsigned int __must_check uds_get_volume_index_zone(const struct volume_index *volume_index,
+                                                   const struct uds_record_name *name);
+
+bool __must_check uds_is_volume_index_sample(const struct volume_index *volume_index,
+                                            const struct uds_record_name *name);
+
+/*
+ * This function is only used to manage sparse cache membership. Most requests should use
+ * uds_get_volume_index_record() to look up index records instead.
+ */
+u64 __must_check uds_lookup_volume_index_name(const struct volume_index *volume_index,
+                                             const struct uds_record_name *name);
+
+int __must_check uds_get_volume_index_record(struct volume_index *volume_index,
+                                            const struct uds_record_name *name,
+                                            struct volume_index_record *record);
+
+int __must_check uds_put_volume_index_record(struct volume_index_record *record,
+                                            u64 virtual_chapter);
+
+int __must_check uds_remove_volume_index_record(struct volume_index_record *record);
+
+int __must_check uds_set_volume_index_record_chapter(struct volume_index_record *record,
+                                                    u64 virtual_chapter);
+
+void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
+                                      u64 virtual_chapter);
+
+void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
+                                           unsigned int zone_number,
+                                           u64 virtual_chapter);
+
+int __must_check uds_load_volume_index(struct volume_index *volume_index,
+                                      struct buffered_reader **readers,
+                                      unsigned int reader_count);
+
+int __must_check uds_save_volume_index(struct volume_index *volume_index,
+                                      struct buffered_writer **writers,
+                                      unsigned int writer_count);
+
+void uds_get_volume_index_stats(const struct volume_index *volume_index,
+                               struct volume_index_stats *stats);
+
+#endif /* UDS_VOLUME_INDEX_H */
diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c
new file mode 100644 (file)
index 0000000..eca83b6
--- /dev/null
@@ -0,0 +1,1695 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "volume.h"
+
+#include <linux/atomic.h>
+#include <linux/dm-bufio.h>
+#include <linux/err.h>
+
+#include "errors.h"
+#include "logger.h"
+#include "memory-alloc.h"
+#include "permassert.h"
+#include "string-utils.h"
+#include "thread-utils.h"
+
+#include "chapter-index.h"
+#include "config.h"
+#include "geometry.h"
+#include "hash-utils.h"
+#include "index.h"
+#include "sparse-cache.h"
+
+/*
+ * The first block of the volume layout is reserved for the volume header, which is no longer used.
+ * The remainder of the volume is divided into chapters consisting of several pages of records, and
+ * several pages of static index to use to find those records. The index pages are recorded first,
+ * followed by the record pages. The chapters are written in order as they are filled, so the
+ * volume storage acts as a circular log of the most recent chapters, with each new chapter
+ * overwriting the oldest saved one.
+ *
+ * When a new chapter is filled and closed, the records from that chapter are sorted and
+ * interleaved in approximate temporal order, and assigned to record pages. Then a static delta
+ * index is generated to store which record page contains each record. The in-memory index page map
+ * is also updated to indicate which delta lists fall on each chapter index page. This means that
+ * when a record is read, the volume only has to load a single index page and a single record page,
+ * rather than search the entire chapter. These index and record pages are written to storage, and
+ * the index pages are transferred to the page cache under the theory that the most recently
+ * written chapter is likely to be accessed again soon.
+ *
+ * When reading a record, the volume index will indicate which chapter should contain it. The
+ * volume uses the index page map to determine which chapter index page needs to be loaded, and
+ * then reads the relevant record page number from the chapter index. Both index and record pages
+ * are stored in a page cache when read for the common case that subsequent records need the same
+ * pages. The page cache evicts the least recently accessed entries when caching new pages. In
+ * addition, the volume uses dm-bufio to manage access to the storage, which may allow for
+ * additional caching depending on available system resources.
+ *
+ * Record requests are handled from cached pages when possible. If a page needs to be read, it is
+ * placed on a queue along with the request that wants to read it. Any requests for the same page
+ * that arrive while the read is pending are added to the queue entry. A separate reader thread
+ * handles the queued reads, adding the page to the cache and updating any requests queued with it
+ * so they can continue processing. This allows the index zone threads to continue processing new
+ * requests rather than wait for the storage reads.
+ *
+ * When an index rebuild is necessary, the volume reads each stored chapter to determine which
+ * range of chapters contain valid records, so that those records can be used to reconstruct the
+ * in-memory volume index.
+ */
+
+enum {
+       /* The maximum allowable number of contiguous bad chapters */
+       MAX_BAD_CHAPTERS = 100,
+       VOLUME_CACHE_MAX_ENTRIES = (U16_MAX >> 1),
+       VOLUME_CACHE_QUEUED_FLAG = (1 << 15),
+       VOLUME_CACHE_MAX_QUEUED_READS = 4096,
+};
+
+static const u64 BAD_CHAPTER = U64_MAX;
+
+/*
+ * The invalidate counter is two 32-bit fields stored together atomically. The low order 32 bits
+ * are the physical page number of the cached page being read. The high order 32 bits are a
+ * sequence number. This value is written when the zone that owns it begins or completes a cache
+ * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting
+ * to update the cache contents.
+ */
+union invalidate_counter {
+       u64 value;
+       struct {
+               u32 page;
+               u32 counter;
+       };
+};
+
+static inline u32 map_to_page_number(struct index_geometry *geometry, u32 physical_page)
+{
+       return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter;
+}
+
+static inline u32 map_to_chapter_number(struct index_geometry *geometry, u32 physical_page)
+{
+       return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter;
+}
+
+static inline bool is_record_page(struct index_geometry *geometry, u32 physical_page)
+{
+       return map_to_page_number(geometry, physical_page) >= geometry->index_pages_per_chapter;
+}
+
+static u32 map_to_physical_page(const struct index_geometry *geometry, u32 chapter, u32 page)
+{
+       /* Page zero is the header page, so the first chapter index page is page one. */
+       return HEADER_PAGES_PER_VOLUME + (geometry->pages_per_chapter * chapter) + page;
+}
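+
+/*
+ * A worked example of the mappings above, assuming an illustrative geometry
+ * with 256 pages per chapter and HEADER_PAGES_PER_VOLUME of 1: chapter 3,
+ * page 10 maps to physical page 1 + (256 * 3) + 10 = 779, and the functions
+ * above recover page number 10 and chapter number 3 from physical page 779.
+ */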
+
+static inline union invalidate_counter get_invalidate_counter(struct page_cache *cache,
+                                                             unsigned int zone_number)
+{
+       return (union invalidate_counter) {
+               .value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value),
+       };
+}
+
+static inline void set_invalidate_counter(struct page_cache *cache,
+                                         unsigned int zone_number,
+                                         union invalidate_counter invalidate_counter)
+{
+       WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value,
+                  invalidate_counter.value);
+}
+
+static inline bool search_pending(union invalidate_counter invalidate_counter)
+{
+       return (invalidate_counter.counter & 1) != 0;
+}
+
+/* Lock the cache for a zone in order to search for a page. */
+static void begin_pending_search(struct page_cache *cache, u32 physical_page,
+                                unsigned int zone_number)
+{
+       union invalidate_counter invalidate_counter =
+               get_invalidate_counter(cache, zone_number);
+
+       invalidate_counter.page = physical_page;
+       invalidate_counter.counter++;
+       set_invalidate_counter(cache, zone_number, invalidate_counter);
+       ASSERT_LOG_ONLY(search_pending(invalidate_counter),
+                       "Search is pending for zone %u", zone_number);
+       /*
+        * This memory barrier ensures that the write to the invalidate counter is seen by other
+        * threads before this thread accesses the cached page. The corresponding read memory
+        * barrier is in wait_for_pending_searches().
+        */
+       smp_mb();
+}
+
+/* Unlock the cache for a zone by clearing its invalidate counter. */
+static void end_pending_search(struct page_cache *cache, unsigned int zone_number)
+{
+       union invalidate_counter invalidate_counter;
+
+       /*
+        * This memory barrier ensures that this thread completes reads of the
+        * cached page before other threads see the write to the invalidate
+        * counter.
+        */
+       smp_mb();
+
+       invalidate_counter = get_invalidate_counter(cache, zone_number);
+       ASSERT_LOG_ONLY(search_pending(invalidate_counter),
+                       "Search is pending for zone %u", zone_number);
+       invalidate_counter.counter++;
+       set_invalidate_counter(cache, zone_number, invalidate_counter);
+}
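+
+/*
+ * An illustrative sketch of how a zone thread is expected to bracket a cache
+ * search with the two functions above (get_page_from_cache() and
+ * make_page_most_recent() are defined later in this file):
+ *
+ *     begin_pending_search(cache, physical_page, zone_number);
+ *     get_page_from_cache(cache, physical_page, &page);
+ *     if (page != NULL)
+ *             make_page_most_recent(cache, page);
+ *     end_pending_search(cache, zone_number);
+ *
+ * The counter stays odd for the duration of the search, so that
+ * wait_for_pending_searches() will not allow the page to be replaced until
+ * the search has ended.
+ */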
+
+static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page)
+{
+       union invalidate_counter initial_counters[MAX_ZONES];
+       unsigned int i;
+
+       /*
+        * We hold the read_threads_mutex. We are waiting for threads that do not hold the
+        * read_threads_mutex. Those threads have "locked" their targeted page by setting the
+        * search_pending_counter. The corresponding write memory barrier is in
+        * begin_pending_search().
+        */
+       smp_mb();
+
+       for (i = 0; i < cache->zone_count; i++)
+               initial_counters[i] = get_invalidate_counter(cache, i);
+       for (i = 0; i < cache->zone_count; i++) {
+               if (search_pending(initial_counters[i]) &&
+                   (initial_counters[i].page == physical_page)) {
+                       /*
+                        * There is an active search using the physical page. We need to wait for
+                        * the search to finish.
+                        *
+                        * FIXME: Investigate using wait_event() to wait for the search to finish.
+                        */
+                       while (initial_counters[i].value ==
+                              get_invalidate_counter(cache, i).value)
+                               cond_resched();
+               }
+       }
+}
+
+static void release_page_buffer(struct cached_page *page)
+{
+       if (page->buffer != NULL)
+               dm_bufio_release(uds_forget(page->buffer));
+}
+
+static void clear_cache_page(struct page_cache *cache, struct cached_page *page)
+{
+       /* Do not clear read_pending because the read queue relies on it. */
+       release_page_buffer(page);
+       page->physical_page = cache->indexable_pages;
+       WRITE_ONCE(page->last_used, 0);
+}
+
+static void make_page_most_recent(struct page_cache *cache, struct cached_page *page)
+{
+       /*
+        * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
+        * thread holding the read_threads_mutex.
+        */
+       if (atomic64_read(&cache->clock) != READ_ONCE(page->last_used))
+               WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock));
+}
+
+/* Select a page to remove from the cache to make space for a new entry. */
+static struct cached_page *select_victim_in_cache(struct page_cache *cache)
+{
+       struct cached_page *page;
+       int oldest_index = 0;
+       s64 oldest_time = S64_MAX;
+       s64 last_used;
+       u16 i;
+
+       /* Find the oldest unclaimed page. We hold the read_threads_mutex. */
+       for (i = 0; i < cache->cache_slots; i++) {
+               /* A page with a pending read must not be replaced. */
+               if (cache->cache[i].read_pending)
+                       continue;
+
+               last_used = READ_ONCE(cache->cache[i].last_used);
+               if (last_used <= oldest_time) {
+                       oldest_time = last_used;
+                       oldest_index = i;
+               }
+       }
+
+       page = &cache->cache[oldest_index];
+       if (page->physical_page != cache->indexable_pages) {
+               WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
+               wait_for_pending_searches(cache, page->physical_page);
+       }
+
+       page->read_pending = true;
+       clear_cache_page(cache, page);
+       return page;
+}
+
+/* Make a newly filled cache entry available to other threads. */
+static int put_page_in_cache(struct page_cache *cache, u32 physical_page,
+                            struct cached_page *page)
+{
+       int result;
+
+       /* We hold the read_threads_mutex. */
+       result = ASSERT((page->read_pending), "page to install has a pending read");
+       if (result != UDS_SUCCESS)
+               return result;
+
+       page->physical_page = physical_page;
+       make_page_most_recent(cache, page);
+       page->read_pending = false;
+
+       /*
+        * We hold the read_threads_mutex, but we must have a write memory barrier before making
+        * the cached_page available to the readers that do not hold the mutex. The corresponding
+        * read memory barrier is in get_page_and_index().
+        */
+       smp_wmb();
+
+       /* This assignment also clears the queued flag. */
+       WRITE_ONCE(cache->index[physical_page], page - cache->cache);
+       return UDS_SUCCESS;
+}
+
+static void cancel_page_in_cache(struct page_cache *cache, u32 physical_page,
+                                struct cached_page *page)
+{
+       int result;
+
+       /* We hold the read_threads_mutex. */
+       result = ASSERT((page->read_pending), "page to install has a pending read");
+       if (result != UDS_SUCCESS)
+               return;
+
+       clear_cache_page(cache, page);
+       page->read_pending = false;
+
+       /* Clear the mapping and the queued flag for the new page. */
+       WRITE_ONCE(cache->index[physical_page], cache->cache_slots);
+}
+
+static inline u16 next_queue_position(u16 position)
+{
+       return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS;
+}
+
+static inline void advance_queue_position(u16 *position)
+{
+       *position = next_queue_position(*position);
+}
+
+static inline bool read_queue_is_full(struct page_cache *cache)
+{
+       return cache->read_queue_first == next_queue_position(cache->read_queue_last);
+}
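+
+/*
+ * The read queue is a ring buffer that always leaves one slot empty so that a
+ * full queue can be distinguished from an empty one, so it holds at most
+ * VOLUME_CACHE_MAX_QUEUED_READS - 1 entries at a time.
+ */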
+
+static bool enqueue_read(struct page_cache *cache, struct uds_request *request,
+                        u32 physical_page)
+{
+       struct queued_read *queue_entry;
+       u16 last = cache->read_queue_last;
+       u16 read_queue_index;
+
+       /* We hold the read_threads_mutex. */
+       if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) {
+               /* This page has no existing entry in the queue. */
+               if (read_queue_is_full(cache))
+                       return false;
+
+               /* Fill in the read queue entry. */
+               cache->read_queue[last].physical_page = physical_page;
+               cache->read_queue[last].invalid = false;
+               cache->read_queue[last].first_request = NULL;
+               cache->read_queue[last].last_request = NULL;
+
+               /* Point the cache index to the read queue entry. */
+               read_queue_index = last;
+               WRITE_ONCE(cache->index[physical_page],
+                          read_queue_index | VOLUME_CACHE_QUEUED_FLAG);
+
+               advance_queue_position(&cache->read_queue_last);
+       } else {
+               /* It's already queued, so add this request to the existing entry. */
+               read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG;
+       }
+
+       request->next_request = NULL;
+       queue_entry = &cache->read_queue[read_queue_index];
+       if (queue_entry->first_request == NULL)
+               queue_entry->first_request = request;
+       else
+               queue_entry->last_request->next_request = request;
+       queue_entry->last_request = request;
+
+       return true;
+}
+
+static void enqueue_page_read(struct volume *volume, struct uds_request *request,
+                             u32 physical_page)
+{
+       /* Mark the page as queued, so that chapter invalidation knows to cancel a read. */
+       while (!enqueue_read(&volume->page_cache, request, physical_page)) {
+               uds_log_debug("Read queue full, waiting for reads to finish");
+               uds_wait_cond(&volume->read_threads_read_done_cond,
+                             &volume->read_threads_mutex);
+       }
+
+       uds_signal_cond(&volume->read_threads_cond);
+}
+
+/*
+ * Reserve the next read queue entry for processing, but do not actually remove it from the queue.
+ * Must be followed by release_queued_requests().
+ */
+static struct queued_read *reserve_read_queue_entry(struct page_cache *cache)
+{
+       /* We hold the read_threads_mutex. */
+       struct queued_read *entry;
+       u16 index_value;
+       bool queued;
+
+       /* No items to dequeue */
+       if (cache->read_queue_next_read == cache->read_queue_last)
+               return NULL;
+
+       entry = &cache->read_queue[cache->read_queue_next_read];
+       index_value = cache->index[entry->physical_page];
+       queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
+       /* Check to see if it's still queued before resetting. */
+       if (entry->invalid && queued)
+               WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots);
+
+       /*
+        * If a synchronous read has taken this page, set invalid to true so it doesn't get
+        * overwritten. Requests will just be requeued.
+        */
+       if (!queued)
+               entry->invalid = true;
+
+       entry->reserved = true;
+       advance_queue_position(&cache->read_queue_next_read);
+       return entry;
+}
+
+static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume)
+{
+       struct queued_read *queue_entry = NULL;
+
+       while (!volume->read_threads_exiting) {
+               queue_entry = reserve_read_queue_entry(&volume->page_cache);
+               if (queue_entry != NULL)
+                       break;
+
+               uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex);
+       }
+
+       return queue_entry;
+}
+
+static int init_chapter_index_page(const struct volume *volume, u8 *index_page,
+                                  u32 chapter, u32 index_page_number,
+                                  struct delta_index_page *chapter_index_page)
+{
+       u64 ci_virtual;
+       u32 ci_chapter;
+       u32 lowest_list;
+       u32 highest_list;
+       struct index_geometry *geometry = volume->geometry;
+       int result;
+
+       result = uds_initialize_chapter_index_page(chapter_index_page, geometry,
+                                                  index_page, volume->nonce);
+       if (volume->lookup_mode == LOOKUP_FOR_REBUILD)
+               return result;
+
+       if (result != UDS_SUCCESS) {
+               return uds_log_error_strerror(result,
+                                             "Reading chapter index page for chapter %u page %u",
+                                             chapter, index_page_number);
+       }
+
+       uds_get_list_number_bounds(volume->index_page_map, chapter, index_page_number,
+                                  &lowest_list, &highest_list);
+       ci_virtual = chapter_index_page->virtual_chapter_number;
+       ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual);
+       if ((chapter == ci_chapter) &&
+           (lowest_list == chapter_index_page->lowest_list_number) &&
+           (highest_list == chapter_index_page->highest_list_number))
+               return UDS_SUCCESS;
+
+       uds_log_warning("Index page map updated to %llu",
+                       (unsigned long long) volume->index_page_map->last_update);
+       uds_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u",
+                       chapter, index_page_number, lowest_list, highest_list,
+                       (unsigned long long) ci_virtual,
+                       chapter_index_page->lowest_list_number,
+                       chapter_index_page->highest_list_number);
+       return uds_log_error_strerror(UDS_CORRUPT_DATA,
+                                     "index page map mismatch with chapter index");
+}
+
+static int initialize_index_page(const struct volume *volume, u32 physical_page,
+                                struct cached_page *page)
+{
+       u32 chapter = map_to_chapter_number(volume->geometry, physical_page);
+       u32 index_page_number = map_to_page_number(volume->geometry, physical_page);
+
+       return init_chapter_index_page(volume, dm_bufio_get_block_data(page->buffer),
+                                      chapter, index_page_number, &page->index_page);
+}
+
+static bool search_record_page(const u8 record_page[],
+                              const struct uds_record_name *name,
+                              const struct index_geometry *geometry,
+                              struct uds_record_data *metadata)
+{
+       /*
+        * The array of records is sorted by name and stored as a binary tree in heap order, so the
+        * root of the tree is the first array element.
+        */
+       u32 node = 0;
+       const struct uds_volume_record *records = (const struct uds_volume_record *) record_page;
+
+       while (node < geometry->records_per_page) {
+               int result;
+               const struct uds_volume_record *record = &records[node];
+
+               result = memcmp(name, &record->name, UDS_RECORD_NAME_SIZE);
+               if (result == 0) {
+                       if (metadata != NULL)
+                               *metadata = record->data;
+                       return true;
+               }
+
+               /* The children of node N are at indexes 2N+1 and 2N+2. */
+               node = ((2 * node) + ((result < 0) ? 1 : 2));
+       }
+
+       return false;
+}
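+
+/*
+ * An illustrative traversal of the layout described above: with seven sorted
+ * records A < B < ... < G stored in heap order as [D, B, F, A, C, E, G], a
+ * search for E compares against D at index 0, takes the right child to F at
+ * index 2, then the left child to E at index 5, matching after three
+ * comparisons.
+ */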
+
+/*
+ * If we've read in a record page, we're going to do an immediate search, to speed up processing by
+ * avoiding get_record_from_zone(), and to ensure that requests make progress even when queued. If
+ * we've read in an index page, we save the record page number so we don't have to resolve the
+ * index page again. We use the location, virtual_chapter, and old_metadata fields in the request
+ * to allow the index code to know where to begin processing the request again.
+ */
+static int search_page(struct cached_page *page, const struct volume *volume,
+                      struct uds_request *request, u32 physical_page)
+{
+       int result;
+       enum uds_index_region location;
+       u16 record_page_number;
+
+       if (is_record_page(volume->geometry, physical_page)) {
+               if (search_record_page(dm_bufio_get_block_data(page->buffer),
+                                      &request->record_name, volume->geometry,
+                                      &request->old_metadata))
+                       location = UDS_LOCATION_RECORD_PAGE_LOOKUP;
+               else
+                       location = UDS_LOCATION_UNAVAILABLE;
+       } else {
+               result = uds_search_chapter_index_page(&page->index_page,
+                                                      volume->geometry,
+                                                      &request->record_name,
+                                                      &record_page_number);
+               if (result != UDS_SUCCESS)
+                       return result;
+
+               if (record_page_number == NO_CHAPTER_INDEX_ENTRY) {
+                       location = UDS_LOCATION_UNAVAILABLE;
+               } else {
+                       location = UDS_LOCATION_INDEX_PAGE_LOOKUP;
+                       *((u16 *) &request->old_metadata) = record_page_number;
+               }
+       }
+
+       request->location = location;
+       request->found = false;
+       return UDS_SUCCESS;
+}
+
+static int process_entry(struct volume *volume, struct queued_read *entry)
+{
+       u32 page_number = entry->physical_page;
+       struct uds_request *request;
+       struct cached_page *page = NULL;
+       u8 *page_data;
+       int result;
+
+       if (entry->invalid) {
+               uds_log_debug("Requeuing requests for invalid page");
+               return UDS_SUCCESS;
+       }
+
+       page = select_victim_in_cache(&volume->page_cache);
+
+       mutex_unlock(&volume->read_threads_mutex);
+       page_data = dm_bufio_read(volume->client, page_number, &page->buffer);
+       mutex_lock(&volume->read_threads_mutex);
+       if (IS_ERR(page_data)) {
+               result = -PTR_ERR(page_data);
+               uds_log_warning_strerror(result,
+                                        "error reading physical page %u from volume",
+                                        page_number);
+               cancel_page_in_cache(&volume->page_cache, page_number, page);
+               return result;
+       }
+
+       if (entry->invalid) {
+               uds_log_warning("Page %u invalidated after read", page_number);
+               cancel_page_in_cache(&volume->page_cache, page_number, page);
+               return UDS_SUCCESS;
+       }
+
+       if (!is_record_page(volume->geometry, page_number)) {
+               result = initialize_index_page(volume, page_number, page);
+               if (result != UDS_SUCCESS) {
+                       uds_log_warning("Error initializing chapter index page");
+                       cancel_page_in_cache(&volume->page_cache, page_number, page);
+                       return result;
+               }
+       }
+
+       result = put_page_in_cache(&volume->page_cache, page_number, page);
+       if (result != UDS_SUCCESS) {
+               uds_log_warning("Error putting page %u in cache", page_number);
+               cancel_page_in_cache(&volume->page_cache, page_number, page);
+               return result;
+       }
+
+       request = entry->first_request;
+       while ((request != NULL) && (result == UDS_SUCCESS)) {
+               result = search_page(page, volume, request, page_number);
+               request = request->next_request;
+       }
+
+       return result;
+}
+
+static void release_queued_requests(struct volume *volume, struct queued_read *entry,
+                                   int result)
+{
+       struct page_cache *cache = &volume->page_cache;
+       u16 next_read = cache->read_queue_next_read;
+       struct uds_request *request;
+       struct uds_request *next;
+
+       for (request = entry->first_request; request != NULL; request = next) {
+               next = request->next_request;
+               request->status = result;
+               request->requeued = true;
+               uds_enqueue_request(request, STAGE_INDEX);
+       }
+
+       entry->reserved = false;
+
+       /* Move the read_queue_first pointer as far as we can. */
+       while ((cache->read_queue_first != next_read) &&
+              (!cache->read_queue[cache->read_queue_first].reserved))
+               advance_queue_position(&cache->read_queue_first);
+       uds_broadcast_cond(&volume->read_threads_read_done_cond);
+}
+
+static void read_thread_function(void *arg)
+{
+       struct volume *volume = arg;
+
+       uds_log_debug("reader starting");
+       mutex_lock(&volume->read_threads_mutex);
+       while (true) {
+               struct queued_read *queue_entry;
+               int result;
+
+               queue_entry = wait_to_reserve_read_queue_entry(volume);
+               if (volume->read_threads_exiting)
+                       break;
+
+               result = process_entry(volume, queue_entry);
+               release_queued_requests(volume, queue_entry, result);
+       }
+       mutex_unlock(&volume->read_threads_mutex);
+       uds_log_debug("reader done");
+}
+
+static void get_page_and_index(struct page_cache *cache, u32 physical_page,
+                              int *queue_index, struct cached_page **page_ptr)
+{
+       u16 index_value;
+       u16 index;
+       bool queued;
+
+       /*
+        * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
+        * thread holding the read_threads_mutex.
+        *
+        * Holding only a search_pending_counter is the most frequent case.
+        */
+       /*
+        * It would be unlikely for the compiler to turn the usage of index_value into two reads of
+        * cache->index, but it would be possible and very bad if those reads did not return the
+        * same bits.
+        */
+       index_value = READ_ONCE(cache->index[physical_page]);
+       queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
+       index = index_value & ~VOLUME_CACHE_QUEUED_FLAG;
+
+       if (!queued && (index < cache->cache_slots)) {
+               *page_ptr = &cache->cache[index];
+               /*
+                * We have acquired access to the cached page, but unless we hold the
+                * read_threads_mutex, we need a read memory barrier now. The corresponding write
+                * memory barrier is in put_page_in_cache().
+                */
+               smp_rmb();
+       } else {
+               *page_ptr = NULL;
+       }
+
+       *queue_index = queued ? index : -1;
+}
+
+static void get_page_from_cache(struct page_cache *cache, u32 physical_page,
+                               struct cached_page **page)
+{
+       /*
+        * ASSERTION: We are in a zone thread.
+        * ASSERTION: We are holding a search_pending_counter or the read_threads_mutex.
+        */
+       int queue_index = -1;
+
+       get_page_and_index(cache, physical_page, &queue_index, page);
+}
+
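+/* Read a page into a newly claimed cache slot. The caller must hold the read threads mutex. */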
+static int read_page_locked(struct volume *volume, u32 physical_page,
+                           struct cached_page **page_ptr)
+{
+       int result = UDS_SUCCESS;
+       struct cached_page *page = NULL;
+       u8 *page_data;
+
+       page = select_victim_in_cache(&volume->page_cache);
+       page_data = dm_bufio_read(volume->client, physical_page, &page->buffer);
+       if (IS_ERR(page_data)) {
+               result = -PTR_ERR(page_data);
+               uds_log_warning_strerror(result,
+                                        "error reading physical page %u from volume",
+                                        physical_page);
+               cancel_page_in_cache(&volume->page_cache, physical_page, page);
+               return result;
+       }
+
+       if (!is_record_page(volume->geometry, physical_page)) {
+               result = initialize_index_page(volume, physical_page, page);
+               if (result != UDS_SUCCESS) {
+                       if (volume->lookup_mode != LOOKUP_FOR_REBUILD)
+                               uds_log_warning("Corrupt index page %u", physical_page);
+                       cancel_page_in_cache(&volume->page_cache, physical_page, page);
+                       return result;
+               }
+       }
+
+       result = put_page_in_cache(&volume->page_cache, physical_page, page);
+       if (result != UDS_SUCCESS) {
+               uds_log_warning("Error putting page %u in cache", physical_page);
+               cancel_page_in_cache(&volume->page_cache, physical_page, page);
+               return result;
+       }
+
+       *page_ptr = page;
+       return UDS_SUCCESS;
+}
+
+/* Retrieve a page from the cache while holding the read threads mutex. */
+static int get_volume_page_locked(struct volume *volume, u32 physical_page,
+                                 struct cached_page **page_ptr)
+{
+       int result;
+       struct cached_page *page = NULL;
+
+       get_page_from_cache(&volume->page_cache, physical_page, &page);
+       if (page == NULL) {
+               result = read_page_locked(volume, physical_page, &page);
+               if (result != UDS_SUCCESS)
+                       return result;
+       } else {
+               make_page_most_recent(&volume->page_cache, page);
+       }
+
+       *page_ptr = page;
+       return UDS_SUCCESS;
+}
+
+/* Retrieve a page from the cache while holding a search_pending lock. */
+static int get_volume_page_protected(struct volume *volume, struct uds_request *request,
+                                    u32 physical_page, struct cached_page **page_ptr)
+{
+       struct cached_page *page;
+
+       get_page_from_cache(&volume->page_cache, physical_page, &page);
+       if (page != NULL) {
+               if (request->zone_number == 0) {
+                       /* Only one zone is allowed to update the LRU. */
+                       make_page_most_recent(&volume->page_cache, page);
+               }
+
+               *page_ptr = page;
+               return UDS_SUCCESS;
+       }
+
+       /* Prepare to enqueue a read for the page. */
+       end_pending_search(&volume->page_cache, request->zone_number);
+       mutex_lock(&volume->read_threads_mutex);
+
+       /*
+        * Do the lookup again while holding the read mutex (no longer the fast case so this should
+        * be fine to repeat). We need to do this because a page may have been added to the cache
+        * by a reader thread between the time we searched above and the time we went to actually
+        * try to enqueue it below. This could result in us enqueuing another read for a page which
+        * is already in the cache, which would mean we end up with two entries in the cache for
+        * the same page.
+        */
+       get_page_from_cache(&volume->page_cache, physical_page, &page);
+       if (page == NULL) {
+               enqueue_page_read(volume, request, physical_page);
+               /*
+                * The performance gain from unlocking first, while "search pending" mode is off,
+                * turns out to be significant in some cases. The page is not available yet so
+                * the order does not matter for correctness as it does below.
+                */
+               mutex_unlock(&volume->read_threads_mutex);
+               begin_pending_search(&volume->page_cache, physical_page,
+                                    request->zone_number);
+               return UDS_QUEUED;
+       }
+
+       /*
+        * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and
+        * "search pending" state in careful order so no other thread can mess with the data before
+        * the caller gets to look at it.
+        */
+       begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+       mutex_unlock(&volume->read_threads_mutex);
+       *page_ptr = page;
+       return UDS_SUCCESS;
+}
+
+static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number,
+                          struct cached_page **page_ptr)
+{
+       int result;
+       u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
+
+       mutex_lock(&volume->read_threads_mutex);
+       result = get_volume_page_locked(volume, physical_page, page_ptr);
+       mutex_unlock(&volume->read_threads_mutex);
+       return result;
+}
+
+int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number,
+                              u8 **data_ptr)
+{
+       int result;
+       struct cached_page *page = NULL;
+
+       result = get_volume_page(volume, chapter, page_number, &page);
+       if (result == UDS_SUCCESS)
+               *data_ptr = dm_bufio_get_block_data(page->buffer);
+       return result;
+}
+
+int uds_get_volume_index_page(struct volume *volume, u32 chapter, u32 page_number,
+                             struct delta_index_page **index_page_ptr)
+{
+       int result;
+       struct cached_page *page = NULL;
+
+       result = get_volume_page(volume, chapter, page_number, &page);
+       if (result == UDS_SUCCESS)
+               *index_page_ptr = &page->index_page;
+       return result;
+}
+
+/*
+ * Find the record page associated with a name in a given index page. This will return UDS_QUEUED
+ * if the page in question must be read from storage.
+ */
+static int search_cached_index_page(struct volume *volume, struct uds_request *request,
+                                   u32 chapter, u32 index_page_number,
+                                   u16 *record_page_number)
+{
+       int result;
+       struct cached_page *page = NULL;
+       u32 physical_page = map_to_physical_page(volume->geometry, chapter,
+                                                index_page_number);
+
+       /*
+        * Make sure the invalidate counter is updated before we try and read the mapping. This
+        * prevents this thread from reading a page in the cache which has already been marked for
+        * invalidation by the reader thread, before the reader thread has noticed that the
+        * invalidate_counter has been incremented.
+        */
+       begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+
+       result = get_volume_page_protected(volume, request, physical_page, &page);
+       if (result != UDS_SUCCESS) {
+               end_pending_search(&volume->page_cache, request->zone_number);
+               return result;
+       }
+
+       result = uds_search_chapter_index_page(&page->index_page, volume->geometry,
+                                              &request->record_name,
+                                              record_page_number);
+       end_pending_search(&volume->page_cache, request->zone_number);
+       return result;
+}
+
+/*
+ * Find the metadata associated with a name in a given record page. This will return UDS_QUEUED if
+ * the page in question must be read from storage.
+ */
+int uds_search_cached_record_page(struct volume *volume, struct uds_request *request,
+                                 u32 chapter, u16 record_page_number, bool *found)
+{
+       struct cached_page *record_page;
+       struct index_geometry *geometry = volume->geometry;
+       int result;
+       u32 physical_page, page_number;
+
+       *found = false;
+       if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
+               return UDS_SUCCESS;
+
+       result = ASSERT(record_page_number < geometry->record_pages_per_chapter,
+                       "0 <= %d < %u", record_page_number,
+                       geometry->record_pages_per_chapter);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       page_number = geometry->index_pages_per_chapter + record_page_number;
+
+       physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
+
+       /*
+        * Make sure the invalidate counter is updated before we try and read the mapping. This
+        * prevents this thread from reading a page in the cache which has already been marked for
+        * invalidation by the reader thread, before the reader thread has noticed that the
+        * invalidate_counter has been incremented.
+        */
+       begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
+
+       result = get_volume_page_protected(volume, request, physical_page, &record_page);
+       if (result != UDS_SUCCESS) {
+               end_pending_search(&volume->page_cache, request->zone_number);
+               return result;
+       }
+
+       if (search_record_page(dm_bufio_get_block_data(record_page->buffer),
+                              &request->record_name, geometry, &request->old_metadata))
+               *found = true;
+
+       end_pending_search(&volume->page_cache, request->zone_number);
+       return UDS_SUCCESS;
+}
+
+void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter)
+{
+       const struct index_geometry *geometry = volume->geometry;
+       u32 physical_page = map_to_physical_page(geometry, chapter, 0);
+
+       dm_bufio_prefetch(volume->client, physical_page, geometry->pages_per_chapter);
+}
+
+int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_chapter,
+                                      struct dm_buffer *volume_buffers[],
+                                      struct delta_index_page index_pages[])
+{
+       int result;
+       u32 i;
+       const struct index_geometry *geometry = volume->geometry;
+       u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
+       u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0);
+
+       dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter);
+       for (i = 0; i < geometry->index_pages_per_chapter; i++) {
+               u8 *index_page;
+
+               index_page = dm_bufio_read(volume->client, physical_page + i,
+                                          &volume_buffers[i]);
+               if (IS_ERR(index_page)) {
+                       result = -PTR_ERR(index_page);
+                       uds_log_warning_strerror(result,
+                                                "error reading physical page %u",
+                                                physical_page);
+                       return result;
+               }
+
+               result = init_chapter_index_page(volume, index_page, physical_chapter, i,
+                                                &index_pages[i]);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       return UDS_SUCCESS;
+}
+
+int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request,
+                                bool *found)
+{
+       int result;
+       u32 physical_chapter =
+               uds_map_to_physical_chapter(volume->geometry, request->virtual_chapter);
+       u32 index_page_number;
+       u16 record_page_number;
+
+       index_page_number = uds_find_index_page_number(volume->index_page_map,
+                                                      &request->record_name,
+                                                      physical_chapter);
+
+       if (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP) {
+               record_page_number = *((u16 *) &request->old_metadata);
+       } else {
+               result = search_cached_index_page(volume, request, physical_chapter,
+                                                 index_page_number,
+                                                 &record_page_number);
+               if (result != UDS_SUCCESS)
+                       return result;
+       }
+
+       return uds_search_cached_record_page(volume, request, physical_chapter,
+                                            record_page_number, found);
+}
+
+int uds_search_volume_page_cache_for_rebuild(struct volume *volume,
+                                            const struct uds_record_name *name,
+                                            u64 virtual_chapter, bool *found)
+{
+       int result;
+       struct index_geometry *geometry = volume->geometry;
+       struct cached_page *page;
+       u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
+       u32 index_page_number;
+       u16 record_page_number;
+       u32 page_number;
+
+       *found = false;
+       index_page_number =
+               uds_find_index_page_number(volume->index_page_map, name,
+                                          physical_chapter);
+       result = get_volume_page(volume, physical_chapter, index_page_number, &page);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_search_chapter_index_page(&page->index_page, geometry, name,
+                                              &record_page_number);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
+               return UDS_SUCCESS;
+
+       page_number = geometry->index_pages_per_chapter + record_page_number;
+       result = get_volume_page(volume, physical_chapter, page_number, &page);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       *found = search_record_page(dm_bufio_get_block_data(page->buffer), name,
+                                   geometry, NULL);
+       return UDS_SUCCESS;
+}
+
+static void invalidate_page(struct page_cache *cache, u32 physical_page)
+{
+       struct cached_page *page;
+       int queue_index = -1;
+
+       /* We hold the read_threads_mutex. */
+       get_page_and_index(cache, physical_page, &queue_index, &page);
+       if (page != NULL) {
+               WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
+               wait_for_pending_searches(cache, page->physical_page);
+               clear_cache_page(cache, page);
+       } else if (queue_index > -1) {
+               uds_log_debug("setting pending read to invalid");
+               cache->read_queue[queue_index].invalid = true;
+       }
+}
+
+void uds_forget_chapter(struct volume *volume, u64 virtual_chapter)
+{
+       u32 physical_chapter =
+               uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
+       u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0);
+       u32 i;
+
+       uds_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
+       mutex_lock(&volume->read_threads_mutex);
+       for (i = 0; i < volume->geometry->pages_per_chapter; i++)
+               invalidate_page(&volume->page_cache, first_page + i);
+       mutex_unlock(&volume->read_threads_mutex);
+}
+
+/*
+ * Donate an index page from a newly written chapter to the page cache, since it is likely to be
+ * used again soon. The caller must already hold the reader thread mutex.
+ */
+static int donate_index_page_locked(struct volume *volume, u32 physical_chapter,
+                                   u32 index_page_number, struct dm_buffer *page_buffer)
+{
+       int result;
+       struct cached_page *page = NULL;
+       u32 physical_page =
+               map_to_physical_page(volume->geometry, physical_chapter,
+                                    index_page_number);
+
+       page = select_victim_in_cache(&volume->page_cache);
+       page->buffer = page_buffer;
+       result = init_chapter_index_page(volume, dm_bufio_get_block_data(page_buffer),
+                                        physical_chapter, index_page_number,
+                                        &page->index_page);
+       if (result != UDS_SUCCESS) {
+               uds_log_warning("Error initializing chapter index page");
+               cancel_page_in_cache(&volume->page_cache, physical_page, page);
+               return result;
+       }
+
+       result = put_page_in_cache(&volume->page_cache, physical_page, page);
+       if (result != UDS_SUCCESS) {
+               uds_log_warning("Error putting page %u in cache", physical_page);
+               cancel_page_in_cache(&volume->page_cache, physical_page, page);
+               return result;
+       }
+
+       return UDS_SUCCESS;
+}
+
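+/*
+ * Pack the open chapter index into a sequence of index pages, mark them for writing, record the
+ * delta list boundaries in the index page map, and donate each page to the page cache.
+ */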
+static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
+                            struct open_chapter_index *chapter_index)
+{
+       struct index_geometry *geometry = volume->geometry;
+       struct dm_buffer *page_buffer;
+       u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0);
+       u32 delta_list_number = 0;
+       u32 index_page_number;
+
+       for (index_page_number = 0;
+            index_page_number < geometry->index_pages_per_chapter;
+            index_page_number++) {
+               u8 *page_data;
+               u32 physical_page = first_index_page + index_page_number;
+               u32 lists_packed;
+               bool last_page;
+               int result;
+
+               page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
+               if (IS_ERR(page_data)) {
+                       return uds_log_warning_strerror(-PTR_ERR(page_data),
+                                                       "failed to prepare index page");
+               }
+
+               last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter);
+               result = uds_pack_open_chapter_index_page(chapter_index, page_data,
+                                                         delta_list_number, last_page,
+                                                         &lists_packed);
+               if (result != UDS_SUCCESS) {
+                       dm_bufio_release(page_buffer);
+                       return uds_log_warning_strerror(result,
+                                                       "failed to pack index page");
+               }
+
+               dm_bufio_mark_buffer_dirty(page_buffer);
+
+               if (lists_packed == 0) {
+                       uds_log_debug("no delta lists packed on chapter %u page %u",
+                                     physical_chapter_number, index_page_number);
+               } else {
+                       delta_list_number += lists_packed;
+               }
+
+               uds_update_index_page_map(volume->index_page_map,
+                                         chapter_index->virtual_chapter_number,
+                                         physical_chapter_number, index_page_number,
+                                         delta_list_number - 1);
+
+               mutex_lock(&volume->read_threads_mutex);
+               result = donate_index_page_locked(volume, physical_chapter_number,
+                                                 index_page_number, page_buffer);
+               mutex_unlock(&volume->read_threads_mutex);
+               if (result != UDS_SUCCESS) {
+                       dm_bufio_release(page_buffer);
+                       return result;
+               }
+       }
+
+       return UDS_SUCCESS;
+}
+
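+/*
+ * Lay out the sorted records as an implicit binary search tree: an in-order traversal of the
+ * heap-style node numbering (children of node n at 2n + 1 and 2n + 2) visits the nodes in sorted
+ * order, so each node receives the next record from the sorted array.
+ */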
+static u32 encode_tree(u8 record_page[],
+                      const struct uds_volume_record *sorted_pointers[],
+                      u32 next_record, u32 node, u32 node_count)
+{
+       if (node < node_count) {
+               u32 child = (2 * node) + 1;
+
+               next_record = encode_tree(record_page, sorted_pointers, next_record,
+                                         child, node_count);
+
+               /*
+                * In-order traversal: copy the contents of the next record into the page at the
+                * node offset.
+                */
+               memcpy(&record_page[node * BYTES_PER_RECORD],
+                      sorted_pointers[next_record++], BYTES_PER_RECORD);
+
+               next_record = encode_tree(record_page, sorted_pointers, next_record,
+                                         child + 1, node_count);
+       }
+
+       return next_record;
+}
+
+static int encode_record_page(const struct volume *volume,
+                             const struct uds_volume_record records[], u8 record_page[])
+{
+       int result;
+       u32 i;
+       u32 records_per_page = volume->geometry->records_per_page;
+       const struct uds_volume_record **record_pointers = volume->record_pointers;
+
+       for (i = 0; i < records_per_page; i++)
+               record_pointers[i] = &records[i];
+
+       /*
+        * Sort the record pointers by using just the names in the records, which is less work than
+        * sorting the entire record values.
+        */
+       BUILD_BUG_ON(offsetof(struct uds_volume_record, name) != 0);
+       result = uds_radix_sort(volume->radix_sorter, (const u8 **) record_pointers,
+                               records_per_page, UDS_RECORD_NAME_SIZE);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       encode_tree(record_page, record_pointers, 0, 0, records_per_page);
+       return UDS_SUCCESS;
+}
+
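+/* Sort and encode each page worth of records, then mark the record pages for writing. */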
+static int write_record_pages(struct volume *volume, u32 physical_chapter_number,
+                             const struct uds_volume_record *records)
+{
+       u32 record_page_number;
+       struct index_geometry *geometry = volume->geometry;
+       struct dm_buffer *page_buffer;
+       const struct uds_volume_record *next_record = records;
+       u32 first_record_page = map_to_physical_page(geometry, physical_chapter_number,
+                                                    geometry->index_pages_per_chapter);
+
+       for (record_page_number = 0;
+            record_page_number < geometry->record_pages_per_chapter;
+            record_page_number++) {
+               u8 *page_data;
+               u32 physical_page = first_record_page + record_page_number;
+               int result;
+
+               page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
+               if (IS_ERR(page_data)) {
+                       return uds_log_warning_strerror(-PTR_ERR(page_data),
+                                                       "failed to prepare record page");
+               }
+
+               result = encode_record_page(volume, next_record, page_data);
+               if (result != UDS_SUCCESS) {
+                       dm_bufio_release(page_buffer);
+                       return uds_log_warning_strerror(result,
+                                                       "failed to encode record page %u",
+                                                       record_page_number);
+               }
+
+               next_record += geometry->records_per_page;
+               dm_bufio_mark_buffer_dirty(page_buffer);
+               dm_bufio_release(page_buffer);
+       }
+
+       return UDS_SUCCESS;
+}
+
+int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index,
+                     const struct uds_volume_record *records)
+{
+       int result;
+       u32 physical_chapter_number =
+               uds_map_to_physical_chapter(volume->geometry,
+                                           chapter_index->virtual_chapter_number);
+
+       result = write_index_pages(volume, physical_chapter_number, chapter_index);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = write_record_pages(volume, physical_chapter_number, records);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = -dm_bufio_write_dirty_buffers(volume->client);
+       if (result != UDS_SUCCESS)
+               uds_log_error_strerror(result, "cannot sync chapter to volume");
+
+       return result;
+}
+
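+/*
+ * Determine the virtual chapter number of a physical chapter by examining its index pages,
+ * reporting BAD_CHAPTER if the pages are missing, corrupt, or inconsistent with one another.
+ */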
+static void probe_chapter(struct volume *volume, u32 chapter_number,
+                         u64 *virtual_chapter_number)
+{
+       const struct index_geometry *geometry = volume->geometry;
+       u32 expected_list_number = 0;
+       u32 i;
+       u64 vcn = BAD_CHAPTER;
+
+       *virtual_chapter_number = BAD_CHAPTER;
+       dm_bufio_prefetch(volume->client,
+                         map_to_physical_page(geometry, chapter_number, 0),
+                         geometry->index_pages_per_chapter);
+
+       for (i = 0; i < geometry->index_pages_per_chapter; i++) {
+               struct delta_index_page *page;
+               int result;
+
+               result = uds_get_volume_index_page(volume, chapter_number, i, &page);
+               if (result != UDS_SUCCESS)
+                       return;
+
+               if (page->virtual_chapter_number == BAD_CHAPTER) {
+                       uds_log_error("corrupt index page in chapter %u",
+                                     chapter_number);
+                       return;
+               }
+
+               if (vcn == BAD_CHAPTER) {
+                       vcn = page->virtual_chapter_number;
+               } else if (page->virtual_chapter_number != vcn) {
+                       uds_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu",
+                                     chapter_number, i, (unsigned long long) vcn,
+                                     (unsigned long long) page->virtual_chapter_number);
+                       return;
+               }
+
+               if (expected_list_number != page->lowest_list_number) {
+                       uds_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u",
+                                     chapter_number, i, expected_list_number,
+                                     page->lowest_list_number);
+                       return;
+               }
+               expected_list_number = page->highest_list_number + 1;
+
+               result = uds_validate_chapter_index_page(page, geometry);
+               if (result != UDS_SUCCESS)
+                       return;
+       }
+
+       if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) {
+               uds_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number,
+                             (unsigned long long) vcn, geometry->chapters_per_volume);
+               return;
+       }
+
+       *virtual_chapter_number = vcn;
+}
+
+/* Find the last valid physical chapter in the volume. */
+static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr)
+{
+       u32 span = 1;
+       u32 tries = 0;
+
+       while (limit > 0) {
+               u32 chapter = (span > limit) ? 0 : limit - span;
+               u64 vcn = 0;
+
+               probe_chapter(volume, chapter, &vcn);
+               if (vcn == BAD_CHAPTER) {
+                       limit = chapter;
+                       if (++tries > 1)
+                               span *= 2;
+               } else {
+                       if (span == 1)
+                               break;
+                       span /= 2;
+                       tries = 0;
+               }
+       }
+
+       *limit_ptr = limit;
+}
+
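+/* Locate the lowest and highest virtual chapter numbers actually present in the volume. */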
+static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn,
+                              u64 *highest_vcn)
+{
+       struct index_geometry *geometry = volume->geometry;
+       u64 zero_vcn;
+       u64 lowest = BAD_CHAPTER;
+       u64 highest = BAD_CHAPTER;
+       u64 moved_chapter = BAD_CHAPTER;
+       u32 left_chapter = 0;
+       u32 right_chapter = 0;
+       u32 bad_chapters = 0;
+
+       /*
+        * This method assumes there is at most one run of contiguous bad chapters caused by
+        * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the
+        * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the
+        * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately
+        * precedes the lowest one.
+        */
+
+       /* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */
+       probe_chapter(volume, 0, &zero_vcn);
+
+       /*
+        * Binary search for end of the discontinuity in the monotonically increasing virtual
+        * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're
+        * searching for the index of the smallest value less than zero_vcn. In the case we go off
+        * the end it means that chapter 0 has the lowest vcn.
+        *
+        * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always
+        * skip over the moved chapter when searching, adding it to the range at the end if
+        * necessary.
+        */
+       if (geometry->remapped_physical > 0) {
+               u64 remapped_vcn;
+
+               probe_chapter(volume, geometry->remapped_physical, &remapped_vcn);
+               if (remapped_vcn == geometry->remapped_virtual)
+                       moved_chapter = geometry->remapped_physical;
+       }
+
+       left_chapter = 0;
+       right_chapter = chapter_limit;
+
+       while (left_chapter < right_chapter) {
+               u64 probe_vcn;
+               u32 chapter = (left_chapter + right_chapter) / 2;
+
+               if (chapter == moved_chapter)
+                       chapter--;
+
+               probe_chapter(volume, chapter, &probe_vcn);
+               if (zero_vcn <= probe_vcn) {
+                       left_chapter = chapter + 1;
+                       if (left_chapter == moved_chapter)
+                               left_chapter++;
+               } else {
+                       right_chapter = chapter;
+               }
+       }
+
+       /* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number. */
+       if (left_chapter >= chapter_limit)
+               left_chapter = 0;
+
+       /* At this point, left_chapter is the chapter with the lowest virtual chapter number. */
+       probe_chapter(volume, left_chapter, &lowest);
+
+       /* The moved chapter might be the lowest in the range. */
+       if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1))
+               lowest = geometry->remapped_virtual;
+
+       /*
+        * Circularly scan backwards, moving over any bad chapters until encountering a good one,
+        * which is the chapter with the highest vcn.
+        */
+       while (highest == BAD_CHAPTER) {
+               right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit;
+               if (right_chapter == moved_chapter)
+                       continue;
+
+               probe_chapter(volume, right_chapter, &highest);
+               if (bad_chapters++ >= MAX_BAD_CHAPTERS) {
+                       uds_log_error("too many bad chapters in volume: %u",
+                                     bad_chapters);
+                       return UDS_CORRUPT_DATA;
+               }
+       }
+
+       *lowest_vcn = lowest;
+       *highest_vcn = highest;
+       return UDS_SUCCESS;
+}
+
+/*
+ * Find the highest and lowest contiguous chapters present in the volume and determine their
+ * virtual chapter numbers. This is used by rebuild.
+ */
+int uds_find_volume_chapter_boundaries(struct volume *volume, u64 *lowest_vcn,
+                                      u64 *highest_vcn, bool *is_empty)
+{
+       u32 chapter_limit = volume->geometry->chapters_per_volume;
+
+       find_real_end_of_volume(volume, chapter_limit, &chapter_limit);
+       if (chapter_limit == 0) {
+               *lowest_vcn = 0;
+               *highest_vcn = 0;
+               *is_empty = true;
+               return UDS_SUCCESS;
+       }
+
+       *is_empty = false;
+       return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn);
+}
+
+int __must_check uds_replace_volume_storage(struct volume *volume,
+                                           struct index_layout *layout,
+                                           struct block_device *bdev)
+{
+       int result;
+       u32 i;
+
+       result = uds_replace_index_layout_storage(layout, bdev);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       /* Release all outstanding dm_bufio objects */
+       for (i = 0; i < volume->page_cache.indexable_pages; i++)
+               volume->page_cache.index[i] = volume->page_cache.cache_slots;
+       for (i = 0; i < volume->page_cache.cache_slots; i++)
+               clear_cache_page(&volume->page_cache, &volume->page_cache.cache[i]);
+       if (volume->sparse_cache != NULL)
+               uds_invalidate_sparse_cache(volume->sparse_cache);
+       if (volume->client != NULL)
+               dm_bufio_client_destroy(uds_forget(volume->client));
+
+       return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page,
+                                    volume->reserved_buffers, &volume->client);
+}
+
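+/* Allocate the page cache structures and mark every slot and index entry as empty. */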
+static int __must_check initialize_page_cache(struct page_cache *cache,
+                                             const struct index_geometry *geometry,
+                                             u32 chapters_in_cache,
+                                             unsigned int zone_count)
+{
+       int result;
+       u32 i;
+
+       cache->indexable_pages = geometry->pages_per_volume + 1;
+       cache->cache_slots = chapters_in_cache * geometry->record_pages_per_chapter;
+       cache->zone_count = zone_count;
+       atomic64_set(&cache->clock, 1);
+
+       result = ASSERT((cache->cache_slots <= VOLUME_CACHE_MAX_ENTRIES),
+                       "requested cache size, %u, within limit %u",
+                       cache->cache_slots, VOLUME_CACHE_MAX_ENTRIES);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read,
+                             "volume read queue", &cache->read_queue);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(cache->zone_count, struct search_pending_counter,
+                             "Volume Cache Zones", &cache->search_pending_counters);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(cache->indexable_pages, u16, "page cache index",
+                             &cache->index);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       result = uds_allocate(cache->cache_slots, struct cached_page, "page cache cache",
+                             &cache->cache);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       /* Initialize index values to invalid values. */
+       for (i = 0; i < cache->indexable_pages; i++)
+               cache->index[i] = cache->cache_slots;
+
+       for (i = 0; i < cache->cache_slots; i++)
+               clear_cache_page(cache, &cache->cache[i]);
+
+       return UDS_SUCCESS;
+}
+
+int uds_make_volume(const struct uds_configuration *config, struct index_layout *layout,
+                   struct volume **new_volume)
+{
+       unsigned int i;
+       struct volume *volume = NULL;
+       struct index_geometry *geometry;
+       unsigned int reserved_buffers;
+       int result;
+
+       result = uds_allocate(1, struct volume, "volume", &volume);
+       if (result != UDS_SUCCESS)
+               return result;
+
+       volume->nonce = uds_get_volume_nonce(layout);
+
+       result = uds_copy_index_geometry(config->geometry, &volume->geometry);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume(volume);
+               return uds_log_warning_strerror(result,
+                                               "failed to allocate geometry");
+       }
+       geometry = volume->geometry;
+
+       /*
+        * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one
+        * for each entry in the sparse cache.
+        */
+       reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter;
+       reserved_buffers += 1;
+       if (uds_is_sparse_index_geometry(geometry))
+               reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter);
+       volume->reserved_buffers = reserved_buffers;
+       result = uds_open_volume_bufio(layout, geometry->bytes_per_page,
+                                      volume->reserved_buffers, &volume->client);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume(volume);
+               return result;
+       }
+
+       result = uds_make_radix_sorter(geometry->records_per_page,
+                                      &volume->radix_sorter);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume(volume);
+               return result;
+       }
+
+       result = uds_allocate(geometry->records_per_page,
+                             const struct uds_volume_record *, "record pointers",
+                             &volume->record_pointers);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume(volume);
+               return result;
+       }
+
+       if (uds_is_sparse_index_geometry(geometry)) {
+               size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page;
+
+               result = uds_make_sparse_cache(geometry, config->cache_chapters,
+                                              config->zone_count,
+                                              &volume->sparse_cache);
+               if (result != UDS_SUCCESS) {
+                       uds_free_volume(volume);
+                       return result;
+               }
+
+               volume->cache_size =
+                       page_size * geometry->index_pages_per_chapter * config->cache_chapters;
+       }
+
+       result = initialize_page_cache(&volume->page_cache, geometry,
+                                      config->cache_chapters, config->zone_count);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume(volume);
+               return result;
+       }
+
+       volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page);
+       result = uds_make_index_page_map(geometry, &volume->index_page_map);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume(volume);
+               return result;
+       }
+
+       mutex_init(&volume->read_threads_mutex);
+       uds_init_cond(&volume->read_threads_read_done_cond);
+       uds_init_cond(&volume->read_threads_cond);
+
+       result = uds_allocate(config->read_threads, struct thread *, "reader threads",
+                             &volume->reader_threads);
+       if (result != UDS_SUCCESS) {
+               uds_free_volume(volume);
+               return result;
+       }
+
+       for (i = 0; i < config->read_threads; i++) {
+               result = vdo_create_thread(read_thread_function, (void *) volume,
+                                          "reader", &volume->reader_threads[i]);
+               if (result != UDS_SUCCESS) {
+                       uds_free_volume(volume);
+                       return result;
+               }
+
+               volume->read_thread_count = i + 1;
+       }
+
+       *new_volume = volume;
+       return UDS_SUCCESS;
+}
+
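+/* Release any cached page buffers and free the page cache structures. */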
+static void uninitialize_page_cache(struct page_cache *cache)
+{
+       u16 i;
+
+       if (cache->cache != NULL) {
+               for (i = 0; i < cache->cache_slots; i++)
+                       release_page_buffer(&cache->cache[i]);
+       }
+       uds_free(cache->index);
+       uds_free(cache->cache);
+       uds_free(cache->search_pending_counters);
+       uds_free(cache->read_queue);
+}
+
+void uds_free_volume(struct volume *volume)
+{
+       if (volume == NULL)
+               return;
+
+       if (volume->reader_threads != NULL) {
+               unsigned int i;
+
+               /* This works even if some threads weren't started. */
+               mutex_lock(&volume->read_threads_mutex);
+               volume->read_threads_exiting = true;
+               uds_broadcast_cond(&volume->read_threads_cond);
+               mutex_unlock(&volume->read_threads_mutex);
+               for (i = 0; i < volume->read_thread_count; i++)
+                       vdo_join_threads(volume->reader_threads[i]);
+               uds_free(volume->reader_threads);
+               volume->reader_threads = NULL;
+       }
+
+       /* Must destroy the client AFTER freeing the cached pages. */
+       uninitialize_page_cache(&volume->page_cache);
+       uds_free_sparse_cache(volume->sparse_cache);
+       if (volume->client != NULL)
+               dm_bufio_client_destroy(uds_forget(volume->client));
+
+       uds_free_index_page_map(volume->index_page_map);
+       uds_free_radix_sorter(volume->radix_sorter);
+       uds_free(volume->geometry);
+       uds_free(volume->record_pointers);
+       uds_free(volume);
+}
diff --git a/drivers/md/dm-vdo/indexer/volume.h b/drivers/md/dm-vdo/indexer/volume.h
new file mode 100644 (file)
index 0000000..8679a5e
--- /dev/null
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef UDS_VOLUME_H
+#define UDS_VOLUME_H
+
+#include <linux/atomic.h>
+#include <linux/cache.h>
+#include <linux/dm-bufio.h>
+#include <linux/limits.h>
+
+#include "permassert.h"
+#include "thread-utils.h"
+
+#include "chapter-index.h"
+#include "config.h"
+#include "geometry.h"
+#include "indexer.h"
+#include "index-layout.h"
+#include "index-page-map.h"
+#include "radix-sort.h"
+#include "sparse-cache.h"
+
+/*
+ * The volume manages deduplication records on permanent storage. The term "volume" can also refer
+ * to the region of permanent storage where the records (and the chapters containing them) are
+ * stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages
+ * as necessary.
+ */
+
+enum index_lookup_mode {
+       /* Always do lookups in all chapters normally */
+       LOOKUP_NORMAL,
+       /* Only do a subset of lookups needed when rebuilding an index */
+       LOOKUP_FOR_REBUILD,
+};
+
+struct queued_read {
+       bool invalid;
+       bool reserved;
+       u32 physical_page;
+       struct uds_request *first_request;
+       struct uds_request *last_request;
+};
+
+struct __aligned(L1_CACHE_BYTES) search_pending_counter {
+       u64 atomic_value;
+};
+
+struct cached_page {
+       /* Whether this page is currently being read asynchronously */
+       bool read_pending;
+       /* The physical page stored in this cache entry */
+       u32 physical_page;
+       /* The value of the volume clock when this page was last used */
+       s64 last_used;
+       /* The cached page buffer */
+       struct dm_buffer *buffer;
+       /* The chapter index page, meaningless for record pages */
+       struct delta_index_page index_page;
+};
+
+struct page_cache {
+       /* The number of zones */
+       unsigned int zone_count;
+       /* The number of volume pages that can be cached */
+       u32 indexable_pages;
+       /* The maximum number of simultaneously cached pages */
+       u16 cache_slots;
+       /* An index for each physical page noting where it is in the cache */
+       u16 *index;
+       /* The array of cached pages */
+       struct cached_page *cache;
+       /* A counter for each zone tracking if a search is occurring there */
+       struct search_pending_counter *search_pending_counters;
+       /* The read queue entries as a circular array */
+       struct queued_read *read_queue;
+
+       /* All entries above this point are constant after initialization. */
+
+       /*
+        * These values are all indexes into the array of read queue entries. New entries in the
+        * read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the
+        * lock and then claims the entry pointed to by read_queue_next_read and increments that
+        * value. After the read is completed, the reader thread calls release_read_queue_entry(),
+        * which increments read_queue_first until it points to a pending read, or is equal to
+        * read_queue_next_read. This means that if multiple reads are outstanding,
+        * read_queue_first might not advance until the last of the reads finishes.
+        */
+       u16 read_queue_first;
+       u16 read_queue_next_read;
+       u16 read_queue_last;
+
+       atomic64_t clock;
+};
+
+struct volume {
+       struct index_geometry *geometry;
+       struct dm_bufio_client *client;
+       u64 nonce;
+       size_t cache_size;
+
+       /* A single page worth of records, for sorting */
+       const struct uds_volume_record **record_pointers;
+       /* Sorter for sorting records within each page */
+       struct radix_sorter *radix_sorter;
+
+       struct sparse_cache *sparse_cache;
+       struct page_cache page_cache;
+       struct index_page_map *index_page_map;
+
+       struct mutex read_threads_mutex;
+       struct cond_var read_threads_cond;
+       struct cond_var read_threads_read_done_cond;
+       struct thread **reader_threads;
+       unsigned int read_thread_count;
+       bool read_threads_exiting;
+
+       enum index_lookup_mode lookup_mode;
+       unsigned int reserved_buffers;
+};
+
+int __must_check uds_make_volume(const struct uds_configuration *config,
+                                struct index_layout *layout,
+                                struct volume **new_volume);
+
+void uds_free_volume(struct volume *volume);
+
+int __must_check uds_replace_volume_storage(struct volume *volume,
+                                           struct index_layout *layout,
+                                           struct block_device *bdev);
+
+int __must_check uds_find_volume_chapter_boundaries(struct volume *volume,
+                                                   u64 *lowest_vcn, u64 *highest_vcn,
+                                                   bool *is_empty);
+
+int __must_check uds_search_volume_page_cache(struct volume *volume,
+                                             struct uds_request *request,
+                                             bool *found);
+
+int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume,
+                                                         const struct uds_record_name *name,
+                                                         u64 virtual_chapter,
+                                                         bool *found);
+
+int __must_check uds_search_cached_record_page(struct volume *volume,
+                                              struct uds_request *request, u32 chapter,
+                                              u16 record_page_number, bool *found);
+
+void uds_forget_chapter(struct volume *volume, u64 chapter);
+
+int __must_check uds_write_chapter(struct volume *volume,
+                                  struct open_chapter_index *chapter_index,
+                                  const struct uds_volume_record records[]);
+
+void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter);
+
+int __must_check uds_read_chapter_index_from_volume(const struct volume *volume,
+                                                   u64 virtual_chapter,
+                                                   struct dm_buffer *volume_buffers[],
+                                                   struct delta_index_page index_pages[]);
+
+int __must_check uds_get_volume_record_page(struct volume *volume, u32 chapter,
+                                           u32 page_number, u8 **data_ptr);
+
+int __must_check uds_get_volume_index_page(struct volume *volume, u32 chapter,
+                                          u32 page_number,
+                                          struct delta_index_page **page_ptr);
+
+#endif /* UDS_VOLUME_H */
diff --git a/drivers/md/dm-vdo/io-factory.c b/drivers/md/dm-vdo/io-factory.c
deleted file mode 100644 (file)
index 02242df..0000000
+++ /dev/null
@@ -1,415 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "io-factory.h"
-
-#include <linux/atomic.h>
-#include <linux/blkdev.h>
-#include <linux/err.h>
-#include <linux/mount.h>
-
-#include "logger.h"
-#include "memory-alloc.h"
-#include "numeric.h"
-
-/*
- * The I/O factory object manages access to index storage, which is a contiguous range of blocks on
- * a block device.
- *
- * The factory holds the open device and is responsible for closing it. The factory has methods to
- * make helper structures that can be used to access sections of the index.
- */
-struct io_factory {
-       struct block_device *bdev;
-       atomic_t ref_count;
-};
-
-/* The buffered reader allows efficient I/O by reading page-sized segments into a buffer. */
-struct buffered_reader {
-       struct io_factory *factory;
-       struct dm_bufio_client *client;
-       struct dm_buffer *buffer;
-       sector_t limit;
-       sector_t block_number;
-       u8 *start;
-       u8 *end;
-};
-
-enum { MAX_READ_AHEAD_BLOCKS = 4 };
-
-/*
- * The buffered writer allows efficient I/O by buffering writes and committing page-sized segments
- * to storage.
- */
-struct buffered_writer {
-       struct io_factory *factory;
-       struct dm_bufio_client *client;
-       struct dm_buffer *buffer;
-       sector_t limit;
-       sector_t block_number;
-       u8 *start;
-       u8 *end;
-       int error;
-};
-
-static void uds_get_io_factory(struct io_factory *factory)
-{
-       atomic_inc(&factory->ref_count);
-}
-
-int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_ptr)
-{
-       int result;
-       struct io_factory *factory;
-
-       result = uds_allocate(1, struct io_factory, __func__, &factory);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       factory->bdev = bdev;
-       atomic_set_release(&factory->ref_count, 1);
-
-       *factory_ptr = factory;
-       return UDS_SUCCESS;
-}
-
-int uds_replace_storage(struct io_factory *factory, struct block_device *bdev)
-{
-       factory->bdev = bdev;
-       return UDS_SUCCESS;
-}
-
-/* Free an I/O factory once all references have been released. */
-void uds_put_io_factory(struct io_factory *factory)
-{
-       if (atomic_add_return(-1, &factory->ref_count) <= 0)
-               uds_free(factory);
-}
-
-size_t uds_get_writable_size(struct io_factory *factory)
-{
-       return i_size_read(factory->bdev->bd_inode);
-}
-
-/* Create a struct dm_bufio_client for an index region starting at offset. */
-int uds_make_bufio(struct io_factory *factory, off_t block_offset, size_t block_size,
-                  unsigned int reserved_buffers, struct dm_bufio_client **client_ptr)
-{
-       struct dm_bufio_client *client;
-
-       client = dm_bufio_client_create(factory->bdev, block_size, reserved_buffers, 0,
-                                       NULL, NULL, 0);
-       if (IS_ERR(client))
-               return -PTR_ERR(client);
-
-       dm_bufio_set_sector_offset(client, block_offset * SECTORS_PER_BLOCK);
-       *client_ptr = client;
-       return UDS_SUCCESS;
-}
-
-static void read_ahead(struct buffered_reader *reader, sector_t block_number)
-{
-       if (block_number < reader->limit) {
-               sector_t read_ahead = min((sector_t) MAX_READ_AHEAD_BLOCKS,
-                                         reader->limit - block_number);
-
-               dm_bufio_prefetch(reader->client, block_number, read_ahead);
-       }
-}
-
-void uds_free_buffered_reader(struct buffered_reader *reader)
-{
-       if (reader == NULL)
-               return;
-
-       if (reader->buffer != NULL)
-               dm_bufio_release(reader->buffer);
-
-       dm_bufio_client_destroy(reader->client);
-       uds_put_io_factory(reader->factory);
-       uds_free(reader);
-}
-
-/* Create a buffered reader for an index region starting at offset. */
-int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block_count,
-                            struct buffered_reader **reader_ptr)
-{
-       int result;
-       struct dm_bufio_client *client = NULL;
-       struct buffered_reader *reader = NULL;
-
-       result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(1, struct buffered_reader, "buffered reader", &reader);
-       if (result != UDS_SUCCESS) {
-               dm_bufio_client_destroy(client);
-               return result;
-       }
-
-       *reader = (struct buffered_reader) {
-               .factory = factory,
-               .client = client,
-               .buffer = NULL,
-               .limit = block_count,
-               .block_number = 0,
-               .start = NULL,
-               .end = NULL,
-       };
-
-       read_ahead(reader, 0);
-       uds_get_io_factory(factory);
-       *reader_ptr = reader;
-       return UDS_SUCCESS;
-}
-
-static int position_reader(struct buffered_reader *reader, sector_t block_number,
-                          off_t offset)
-{
-       struct dm_buffer *buffer = NULL;
-       void *data;
-
-       if ((reader->end == NULL) || (block_number != reader->block_number)) {
-               if (block_number >= reader->limit)
-                       return UDS_OUT_OF_RANGE;
-
-               if (reader->buffer != NULL)
-                       dm_bufio_release(uds_forget(reader->buffer));
-
-               data = dm_bufio_read(reader->client, block_number, &buffer);
-               if (IS_ERR(data))
-                       return -PTR_ERR(data);
-
-               reader->buffer = buffer;
-               reader->start = data;
-               if (block_number == reader->block_number + 1)
-                       read_ahead(reader, block_number + 1);
-       }
-
-       reader->block_number = block_number;
-       reader->end = reader->start + offset;
-       return UDS_SUCCESS;
-}
-
-static size_t bytes_remaining_in_read_buffer(struct buffered_reader *reader)
-{
-       return (reader->end == NULL) ? 0 : reader->start + UDS_BLOCK_SIZE - reader->end;
-}
-
-static int reset_reader(struct buffered_reader *reader)
-{
-       sector_t block_number;
-
-       if (bytes_remaining_in_read_buffer(reader) > 0)
-               return UDS_SUCCESS;
-
-       block_number = reader->block_number;
-       if (reader->end != NULL)
-               block_number++;
-
-       return position_reader(reader, block_number, 0);
-}
-
-int uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data,
-                                 size_t length)
-{
-       int result = UDS_SUCCESS;
-       size_t chunk_size;
-
-       while (length > 0) {
-               result = reset_reader(reader);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               chunk_size = min(length, bytes_remaining_in_read_buffer(reader));
-               memcpy(data, reader->end, chunk_size);
-               length -= chunk_size;
-               data += chunk_size;
-               reader->end += chunk_size;
-       }
-
-       return UDS_SUCCESS;
-}
-
-/*
- * Verify that the next data on the reader matches the required value. If the value matches, the
- * matching contents are consumed. If the value does not match, the reader state is unchanged.
- */
-int uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value,
-                            size_t length)
-{
-       int result = UDS_SUCCESS;
-       size_t chunk_size;
-       sector_t start_block_number = reader->block_number;
-       int start_offset = reader->end - reader->start;
-
-       while (length > 0) {
-               result = reset_reader(reader);
-               if (result != UDS_SUCCESS) {
-                       result = UDS_CORRUPT_DATA;
-                       break;
-               }
-
-               chunk_size = min(length, bytes_remaining_in_read_buffer(reader));
-               if (memcmp(value, reader->end, chunk_size) != 0) {
-                       result = UDS_CORRUPT_DATA;
-                       break;
-               }
-
-               length -= chunk_size;
-               value += chunk_size;
-               reader->end += chunk_size;
-       }
-
-       if (result != UDS_SUCCESS)
-               position_reader(reader, start_block_number, start_offset);
-
-       return result;
-}
-
-/* Create a buffered writer for an index region starting at offset. */
-int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block_count,
-                            struct buffered_writer **writer_ptr)
-{
-       int result;
-       struct dm_bufio_client *client = NULL;
-       struct buffered_writer *writer;
-
-       result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(1, struct buffered_writer, "buffered writer", &writer);
-       if (result != UDS_SUCCESS) {
-               dm_bufio_client_destroy(client);
-               return result;
-       }
-
-       *writer = (struct buffered_writer) {
-               .factory = factory,
-               .client = client,
-               .buffer = NULL,
-               .limit = block_count,
-               .start = NULL,
-               .end = NULL,
-               .block_number = 0,
-               .error = UDS_SUCCESS,
-       };
-
-       uds_get_io_factory(factory);
-       *writer_ptr = writer;
-       return UDS_SUCCESS;
-}
-
-static size_t get_remaining_write_space(struct buffered_writer *writer)
-{
-       return writer->start + UDS_BLOCK_SIZE - writer->end;
-}
-
-static int __must_check prepare_next_buffer(struct buffered_writer *writer)
-{
-       struct dm_buffer *buffer = NULL;
-       void *data;
-
-       if (writer->block_number >= writer->limit) {
-               writer->error = UDS_OUT_OF_RANGE;
-               return UDS_OUT_OF_RANGE;
-       }
-
-       data = dm_bufio_new(writer->client, writer->block_number, &buffer);
-       if (IS_ERR(data)) {
-               writer->error = -PTR_ERR(data);
-               return writer->error;
-       }
-
-       writer->buffer = buffer;
-       writer->start = data;
-       writer->end = data;
-       return UDS_SUCCESS;
-}
-
-static int flush_previous_buffer(struct buffered_writer *writer)
-{
-       size_t available;
-
-       if (writer->buffer == NULL)
-               return writer->error;
-
-       if (writer->error == UDS_SUCCESS) {
-               available = get_remaining_write_space(writer);
-
-               if (available > 0)
-                       memset(writer->end, 0, available);
-
-               dm_bufio_mark_buffer_dirty(writer->buffer);
-       }
-
-       dm_bufio_release(writer->buffer);
-       writer->buffer = NULL;
-       writer->start = NULL;
-       writer->end = NULL;
-       writer->block_number++;
-       return writer->error;
-}
-
-void uds_free_buffered_writer(struct buffered_writer *writer)
-{
-       int result;
-
-       if (writer == NULL)
-               return;
-
-       flush_previous_buffer(writer);
-       result = -dm_bufio_write_dirty_buffers(writer->client);
-       if (result != UDS_SUCCESS)
-               uds_log_warning_strerror(result, "%s: failed to sync storage", __func__);
-
-       dm_bufio_client_destroy(writer->client);
-       uds_put_io_factory(writer->factory);
-       uds_free(writer);
-}
-
-/*
- * Append data to the buffer, writing as needed. If no data is provided, zeros are written instead.
- * If a write error occurs, it is recorded and returned on every subsequent write attempt.
- */
-int uds_write_to_buffered_writer(struct buffered_writer *writer, const u8 *data,
-                                size_t length)
-{
-       int result = writer->error;
-       size_t chunk_size;
-
-       while ((length > 0) && (result == UDS_SUCCESS)) {
-               if (writer->buffer == NULL) {
-                       result = prepare_next_buffer(writer);
-                       continue;
-               }
-
-               chunk_size = min(length, get_remaining_write_space(writer));
-               if (data == NULL) {
-                       memset(writer->end, 0, chunk_size);
-               } else {
-                       memcpy(writer->end, data, chunk_size);
-                       data += chunk_size;
-               }
-
-               length -= chunk_size;
-               writer->end += chunk_size;
-
-               if (get_remaining_write_space(writer) == 0)
-                       result = uds_flush_buffered_writer(writer);
-       }
-
-       return result;
-}
-
-int uds_flush_buffered_writer(struct buffered_writer *writer)
-{
-       if (writer->error != UDS_SUCCESS)
-               return writer->error;
-
-       return flush_previous_buffer(writer);
-}
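
The NULL-data behavior of uds_write_to_buffered_writer() above is what lets callers emit runs of zeros without staging a zero-filled buffer. A minimal usage sketch (not part of this patch), assuming a writer already created for the region; the header pointer, header_size, and reserved_size are hypothetical:

/* Illustrative sketch only; header_size and reserved_size are hypothetical. */
static int write_header_with_reserved_area(struct buffered_writer *writer,
                                           const u8 *header, size_t header_size,
                                           size_t reserved_size)
{
        int result;

        result = uds_write_to_buffered_writer(writer, header, header_size);
        if (result != UDS_SUCCESS)
                return result;

        /* Passing NULL writes zeros, so no scratch buffer is needed. */
        result = uds_write_to_buffered_writer(writer, NULL, reserved_size);
        if (result != UDS_SUCCESS)
                return result;

        /* Any previously latched write error is also reported here. */
        return uds_flush_buffered_writer(writer);
}
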
diff --git a/drivers/md/dm-vdo/io-factory.h b/drivers/md/dm-vdo/io-factory.h
deleted file mode 100644 (file)
index 7fb5a06..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_IO_FACTORY_H
-#define UDS_IO_FACTORY_H
-
-#include <linux/dm-bufio.h>
-
-/*
- * The I/O factory manages all low-level I/O operations to the underlying storage device. Its main
- * clients are the index layout and the volume. The buffered reader and buffered writer interfaces
- * are helpers for accessing data in a contiguous range of storage blocks.
- */
-
-struct buffered_reader;
-struct buffered_writer;
-
-struct io_factory;
-
-enum {
-       UDS_BLOCK_SIZE = 4096,
-       SECTORS_PER_BLOCK = UDS_BLOCK_SIZE >> SECTOR_SHIFT,
-};
-
-int __must_check uds_make_io_factory(struct block_device *bdev,
-                                    struct io_factory **factory_ptr);
-
-int __must_check uds_replace_storage(struct io_factory *factory,
-                                    struct block_device *bdev);
-
-void uds_put_io_factory(struct io_factory *factory);
-
-size_t __must_check uds_get_writable_size(struct io_factory *factory);
-
-int __must_check uds_make_bufio(struct io_factory *factory, off_t block_offset,
-                               size_t block_size, unsigned int reserved_buffers,
-                               struct dm_bufio_client **client_ptr);
-
-int __must_check uds_make_buffered_reader(struct io_factory *factory, off_t offset,
-                                         u64 block_count,
-                                         struct buffered_reader **reader_ptr);
-
-void uds_free_buffered_reader(struct buffered_reader *reader);
-
-int __must_check uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data,
-                                              size_t length);
-
-int __must_check uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value,
-                                         size_t length);
-
-int __must_check uds_make_buffered_writer(struct io_factory *factory, off_t offset,
-                                         u64 block_count,
-                                         struct buffered_writer **writer_ptr);
-
-void uds_free_buffered_writer(struct buffered_writer *buffer);
-
-int __must_check uds_write_to_buffered_writer(struct buffered_writer *writer,
-                                             const u8 *data, size_t length);
-
-int __must_check uds_flush_buffered_writer(struct buffered_writer *writer);
-
-#endif /* UDS_IO_FACTORY_H */
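
On the read side, the usual pattern (mirrored by uds_load_open_chapter() below) is to verify a magic string with uds_verify_buffered_data() and then pull the payload with uds_read_from_buffered_reader(). A sketch assuming the declarations above; the function name and the 5-byte "MAGIC" string are hypothetical, illustrative only:

static int read_region_with_magic(struct io_factory *factory, off_t offset,
                                  u64 block_count, u8 *payload, size_t length)
{
        static const u8 MAGIC[] = "MAGIC";      /* hypothetical region magic */
        struct buffered_reader *reader;
        int result;

        result = uds_make_buffered_reader(factory, offset, block_count, &reader);
        if (result != UDS_SUCCESS)
                return result;

        /* On a mismatch the reader position is left unchanged and an error is returned. */
        result = uds_verify_buffered_data(reader, MAGIC, sizeof(MAGIC) - 1);
        if (result == UDS_SUCCESS)
                result = uds_read_from_buffered_reader(reader, payload, length);

        uds_free_buffered_reader(reader);
        return result;
}
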
diff --git a/drivers/md/dm-vdo/open-chapter.c b/drivers/md/dm-vdo/open-chapter.c
deleted file mode 100644 (file)
index d9d6e5d..0000000
+++ /dev/null
@@ -1,427 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "open-chapter.h"
-
-#include <linux/log2.h>
-
-#include "config.h"
-#include "hash-utils.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "numeric.h"
-#include "permassert.h"
-
-/*
- * Each index zone has a dedicated open chapter zone structure which gets an equal share of the
- * open chapter space. Records are assigned to zones based on their record name. Within each zone,
- * records are stored in an array in the order they arrive. Additionally, a reference to each
- * record is stored in a hash table to help determine if a new record duplicates an existing one.
- * If new metadata for an existing name arrives, the record is altered in place. The array of
- * records is 1-based so that record number 0 can be used to indicate an unused hash slot.
- *
- * Deleted records are marked with a flag rather than actually removed to simplify hash table
- * management. The array of deleted flags overlays the array of hash slots, but the flags are
- * indexed by record number instead of by record name. The number of hash slots will always be a
- * power of two that is greater than the number of records to be indexed, guaranteeing that hash
- * insertion cannot fail, and that there are sufficient flags for all records.
- *
- * Once any open chapter zone fills its available space, the chapter is closed. The records from
- * each zone are interleaved to attempt to preserve temporal locality and assigned to record pages.
- * Empty or deleted records are replaced by copies of a valid record so that the record pages only
- * contain valid records. The chapter then constructs a delta index, which maps each record name to
- * the record page on which that record can be found; this index is split into index pages. These
- * structures are then passed to the volume to be recorded on storage.
- *
- * When the index is saved, the open chapter records are saved in a single array, once again
- * interleaved to attempt to preserve temporal locality. When the index is reloaded, there may be a
- * different number of zones than previously, so the records must be parcelled out to their new
- * zones. In addition, depending on the distribution of record names, a new zone may have more
- * records than it has space. In this case, the latest records for that zone will be discarded.
- */
-
-static const u8 OPEN_CHAPTER_MAGIC[] = "ALBOC";
-static const u8 OPEN_CHAPTER_VERSION[] = "02.00";
-
-enum {
-       OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1,
-       OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1,
-       LOAD_RATIO = 2,
-};
-
-static inline size_t records_size(const struct open_chapter_zone *open_chapter)
-{
-       return sizeof(struct uds_volume_record) * (1 + open_chapter->capacity);
-}
-
-static inline size_t slots_size(size_t slot_count)
-{
-       return sizeof(struct open_chapter_zone_slot) * slot_count;
-}
-
-int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zone_count,
-                         struct open_chapter_zone **open_chapter_ptr)
-{
-       int result;
-       struct open_chapter_zone *open_chapter;
-       size_t capacity = geometry->records_per_chapter / zone_count;
-       size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO));
-
-       result = uds_allocate_extended(struct open_chapter_zone, slot_count,
-                                      struct open_chapter_zone_slot, "open chapter",
-                                      &open_chapter);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       open_chapter->slot_count = slot_count;
-       open_chapter->capacity = capacity;
-       result = uds_allocate_cache_aligned(records_size(open_chapter), "record pages",
-                                           &open_chapter->records);
-       if (result != UDS_SUCCESS) {
-               uds_free_open_chapter(open_chapter);
-               return result;
-       }
-
-       *open_chapter_ptr = open_chapter;
-       return UDS_SUCCESS;
-}
-
-void uds_reset_open_chapter(struct open_chapter_zone *open_chapter)
-{
-       open_chapter->size = 0;
-       open_chapter->deletions = 0;
-
-       memset(open_chapter->records, 0, records_size(open_chapter));
-       memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count));
-}
-
-static unsigned int probe_chapter_slots(struct open_chapter_zone *open_chapter,
-                                       const struct uds_record_name *name)
-{
-       struct uds_volume_record *record;
-       unsigned int slot_count = open_chapter->slot_count;
-       unsigned int slot = uds_name_to_hash_slot(name, slot_count);
-       unsigned int record_number;
-       unsigned int attempts = 1;
-
-       while (true) {
-               record_number = open_chapter->slots[slot].record_number;
-
-               /*
-                * If the hash slot is empty, we've reached the end of a chain without finding the
-                * record and should terminate the search.
-                */
-               if (record_number == 0)
-                       return slot;
-
-               /*
-                * If the name of the record referenced by the slot matches and has not been
-                * deleted, then we've found the requested name.
-                */
-               record = &open_chapter->records[record_number];
-               if ((memcmp(&record->name, name, UDS_RECORD_NAME_SIZE) == 0) &&
-                   !open_chapter->slots[record_number].deleted)
-                       return slot;
-
-               /*
-                * Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. This
-                * performs better than linear probing and works best for 2^N slots.
-                */
-               slot = (slot + attempts++) % slot_count;
-       }
-}
-
-void uds_search_open_chapter(struct open_chapter_zone *open_chapter,
-                            const struct uds_record_name *name,
-                            struct uds_record_data *metadata, bool *found)
-{
-       unsigned int slot;
-       unsigned int record_number;
-
-       slot = probe_chapter_slots(open_chapter, name);
-       record_number = open_chapter->slots[slot].record_number;
-       if (record_number == 0) {
-               *found = false;
-       } else {
-               *found = true;
-               *metadata = open_chapter->records[record_number].data;
-       }
-}
-
-/* Add a record to the open chapter zone and return the remaining space. */
-int uds_put_open_chapter(struct open_chapter_zone *open_chapter,
-                        const struct uds_record_name *name,
-                        const struct uds_record_data *metadata)
-{
-       unsigned int slot;
-       unsigned int record_number;
-       struct uds_volume_record *record;
-
-       if (open_chapter->size >= open_chapter->capacity)
-               return 0;
-
-       slot = probe_chapter_slots(open_chapter, name);
-       record_number = open_chapter->slots[slot].record_number;
-
-       if (record_number == 0) {
-               record_number = ++open_chapter->size;
-               open_chapter->slots[slot].record_number = record_number;
-       }
-
-       record = &open_chapter->records[record_number];
-       record->name = *name;
-       record->data = *metadata;
-
-       return open_chapter->capacity - open_chapter->size;
-}
-
-void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter,
-                                 const struct uds_record_name *name)
-{
-       unsigned int slot;
-       unsigned int record_number;
-
-       slot = probe_chapter_slots(open_chapter, name);
-       record_number = open_chapter->slots[slot].record_number;
-
-       if (record_number > 0) {
-               open_chapter->slots[record_number].deleted = true;
-               open_chapter->deletions += 1;
-       }
-}
-
-void uds_free_open_chapter(struct open_chapter_zone *open_chapter)
-{
-       if (open_chapter != NULL) {
-               uds_free(open_chapter->records);
-               uds_free(open_chapter);
-       }
-}
-
-/* Map each record name to its record page number in the delta chapter index. */
-static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones,
-                                   unsigned int zone_count,
-                                   struct open_chapter_index *index,
-                                   struct uds_volume_record *collated_records)
-{
-       int result;
-       unsigned int records_per_chapter;
-       unsigned int records_per_page;
-       unsigned int record_index;
-       unsigned int records = 0;
-       u32 page_number;
-       unsigned int z;
-       int overflow_count = 0;
-       struct uds_volume_record *fill_record = NULL;
-
-       /*
-        * The record pages should not have any empty space, so find a record with which to fill
-        * the chapter zone if it was closed early, and also to replace any deleted records. The
-        * last record in any filled zone is guaranteed to not have been deleted, so use one of
-        * those.
-        */
-       for (z = 0; z < zone_count; z++) {
-               struct open_chapter_zone *zone = chapter_zones[z];
-
-               if (zone->size == zone->capacity) {
-                       fill_record = &zone->records[zone->size];
-                       break;
-               }
-       }
-
-       records_per_chapter = index->geometry->records_per_chapter;
-       records_per_page = index->geometry->records_per_page;
-
-       for (records = 0; records < records_per_chapter; records++) {
-               struct uds_volume_record *record = &collated_records[records];
-               struct open_chapter_zone *open_chapter;
-
-               /* The record arrays in the zones are 1-based. */
-               record_index = 1 + (records / zone_count);
-               page_number = records / records_per_page;
-               open_chapter = chapter_zones[records % zone_count];
-
-               /* Use the fill record in place of an unused record. */
-               if (record_index > open_chapter->size ||
-                   open_chapter->slots[record_index].deleted) {
-                       *record = *fill_record;
-                       continue;
-               }
-
-               *record = open_chapter->records[record_index];
-               result = uds_put_open_chapter_index_record(index, &record->name,
-                                                          page_number);
-               switch (result) {
-               case UDS_SUCCESS:
-                       break;
-               case UDS_OVERFLOW:
-                       overflow_count++;
-                       break;
-               default:
-                       uds_log_error_strerror(result,
-                                              "failed to build open chapter index");
-                       return result;
-               }
-       }
-
-       if (overflow_count > 0)
-               uds_log_warning("Failed to add %d entries to chapter index",
-                               overflow_count);
-
-       return UDS_SUCCESS;
-}
-
-int uds_close_open_chapter(struct open_chapter_zone **chapter_zones,
-                          unsigned int zone_count, struct volume *volume,
-                          struct open_chapter_index *chapter_index,
-                          struct uds_volume_record *collated_records,
-                          u64 virtual_chapter_number)
-{
-       int result;
-
-       uds_empty_open_chapter_index(chapter_index, virtual_chapter_number);
-       result = fill_delta_chapter_index(chapter_zones, zone_count, chapter_index,
-                                         collated_records);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return uds_write_chapter(volume, chapter_index, collated_records);
-}
-
-int uds_save_open_chapter(struct uds_index *index, struct buffered_writer *writer)
-{
-       int result;
-       struct open_chapter_zone *open_chapter;
-       struct uds_volume_record *record;
-       u8 record_count_data[sizeof(u32)];
-       u32 record_count = 0;
-       unsigned int record_index;
-       unsigned int z;
-
-       result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC,
-                                             OPEN_CHAPTER_MAGIC_LENGTH);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION,
-                                             OPEN_CHAPTER_VERSION_LENGTH);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       for (z = 0; z < index->zone_count; z++) {
-               open_chapter = index->zones[z]->open_chapter;
-               record_count += open_chapter->size - open_chapter->deletions;
-       }
-
-       put_unaligned_le32(record_count, record_count_data);
-       result = uds_write_to_buffered_writer(writer, record_count_data,
-                                             sizeof(record_count_data));
-       if (result != UDS_SUCCESS)
-               return result;
-
-       record_index = 1;
-       while (record_count > 0) {
-               for (z = 0; z < index->zone_count; z++) {
-                       open_chapter = index->zones[z]->open_chapter;
-                       if (record_index > open_chapter->size)
-                               continue;
-
-                       if (open_chapter->slots[record_index].deleted)
-                               continue;
-
-                       record = &open_chapter->records[record_index];
-                       result = uds_write_to_buffered_writer(writer, (u8 *) record,
-                                                             sizeof(*record));
-                       if (result != UDS_SUCCESS)
-                               return result;
-
-                       record_count--;
-               }
-
-               record_index++;
-       }
-
-       return uds_flush_buffered_writer(writer);
-}
-
-u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry)
-{
-       unsigned int records_per_chapter = geometry->records_per_chapter;
-
-       return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + sizeof(u32) +
-               records_per_chapter * sizeof(struct uds_volume_record);
-}
-
-static int load_version20(struct uds_index *index, struct buffered_reader *reader)
-{
-       int result;
-       u32 record_count;
-       u8 record_count_data[sizeof(u32)];
-       struct uds_volume_record record;
-
-       /*
-        * Track which zones cannot accept any more records. If the open chapter had a different
-        * number of zones previously, some new zones may have more records than they have space
-        * for. These overflow records will be discarded.
-        */
-       bool full_flags[MAX_ZONES] = {
-               false,
-       };
-
-       result = uds_read_from_buffered_reader(reader, (u8 *) &record_count_data,
-                                              sizeof(record_count_data));
-       if (result != UDS_SUCCESS)
-               return result;
-
-       record_count = get_unaligned_le32(record_count_data);
-       while (record_count-- > 0) {
-               unsigned int zone = 0;
-
-               result = uds_read_from_buffered_reader(reader, (u8 *) &record,
-                                                      sizeof(record));
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               if (index->zone_count > 1)
-                       zone = uds_get_volume_index_zone(index->volume_index,
-                                                        &record.name);
-
-               if (!full_flags[zone]) {
-                       struct open_chapter_zone *open_chapter;
-                       unsigned int remaining;
-
-                       open_chapter = index->zones[zone]->open_chapter;
-                       remaining = uds_put_open_chapter(open_chapter, &record.name,
-                                                        &record.data);
-                       /* Do not allow any zone to fill completely. */
-                       full_flags[zone] = (remaining <= 1);
-               }
-       }
-
-       return UDS_SUCCESS;
-}
-
-int uds_load_open_chapter(struct uds_index *index, struct buffered_reader *reader)
-{
-       u8 version[OPEN_CHAPTER_VERSION_LENGTH];
-       int result;
-
-       result = uds_verify_buffered_data(reader, OPEN_CHAPTER_MAGIC,
-                                         OPEN_CHAPTER_MAGIC_LENGTH);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_read_from_buffered_reader(reader, version, sizeof(version));
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (memcmp(OPEN_CHAPTER_VERSION, version, sizeof(version)) != 0) {
-               return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                             "Invalid open chapter version: %.*s",
-                                             (int) sizeof(version), version);
-       }
-
-       return load_version20(index, reader);
-}
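
The probe sequence in probe_chapter_slots() advances by 1, then 2, then 3, and so on, which visits every slot exactly once when the slot count is a power of two (as guaranteed by uds_make_open_chapter()). A stand-alone sketch of the same probing scheme over a toy table of u32 keys, with a modulo hash standing in for uds_name_to_hash_slot(); illustrative only, and it omits the deleted-record handling of the real function:

/*
 * Toy quadratic probe: returns the slot holding key, or the first empty slot
 * (0 marks empty). Assumes slot_count is a power of two and the table always
 * has at least one empty slot, as the open chapter hash table does.
 */
static unsigned int toy_probe_slots(const u32 *table, unsigned int slot_count, u32 key)
{
        unsigned int slot = key % slot_count;
        unsigned int attempts = 1;

        while ((table[slot] != 0) && (table[slot] != key))
                slot = (slot + attempts++) % slot_count;

        return slot;
}
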
diff --git a/drivers/md/dm-vdo/open-chapter.h b/drivers/md/dm-vdo/open-chapter.h
deleted file mode 100644 (file)
index a4250bb..0000000
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_OPEN_CHAPTER_H
-#define UDS_OPEN_CHAPTER_H
-
-#include "chapter-index.h"
-#include "geometry.h"
-#include "index.h"
-#include "volume.h"
-
-/*
- * The open chapter tracks the newest records in memory. Like the index as a whole, each open
- * chapter is divided into a number of independent zones which are interleaved when the chapter is
- * committed to the volume.
- */
-
-enum {
-       OPEN_CHAPTER_RECORD_NUMBER_BITS = 23,
-};
-
-struct open_chapter_zone_slot {
-       /* If non-zero, the record number addressed by this hash slot */
-       unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS;
-       /* If true, the record at the index of this hash slot was deleted */
-       bool deleted : 1;
-} __packed;
-
-struct open_chapter_zone {
-       /* The maximum number of records that can be stored */
-       unsigned int capacity;
-       /* The number of records stored */
-       unsigned int size;
-       /* The number of deleted records */
-       unsigned int deletions;
-       /* Array of chunk records, 1-based */
-       struct uds_volume_record *records;
-       /* The number of slots in the hash table */
-       unsigned int slot_count;
-       /* The hash table slots, referencing virtual record numbers */
-       struct open_chapter_zone_slot slots[];
-};
-
-int __must_check uds_make_open_chapter(const struct index_geometry *geometry,
-                                      unsigned int zone_count,
-                                      struct open_chapter_zone **open_chapter_ptr);
-
-void uds_reset_open_chapter(struct open_chapter_zone *open_chapter);
-
-void uds_search_open_chapter(struct open_chapter_zone *open_chapter,
-                            const struct uds_record_name *name,
-                            struct uds_record_data *metadata, bool *found);
-
-int __must_check uds_put_open_chapter(struct open_chapter_zone *open_chapter,
-                                     const struct uds_record_name *name,
-                                     const struct uds_record_data *metadata);
-
-void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter,
-                                 const struct uds_record_name *name);
-
-void uds_free_open_chapter(struct open_chapter_zone *open_chapter);
-
-int __must_check uds_close_open_chapter(struct open_chapter_zone **chapter_zones,
-                                       unsigned int zone_count, struct volume *volume,
-                                       struct open_chapter_index *chapter_index,
-                                       struct uds_volume_record *collated_records,
-                                       u64 virtual_chapter_number);
-
-int __must_check uds_save_open_chapter(struct uds_index *index,
-                                      struct buffered_writer *writer);
-
-int __must_check uds_load_open_chapter(struct uds_index *index,
-                                      struct buffered_reader *reader);
-
-u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry);
-
-#endif /* UDS_OPEN_CHAPTER_H */
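
For reference, the on-disk size computed by uds_compute_saved_open_chapter_size() is just the magic, the version string, a little-endian 32-bit record count, and the full complement of interleaved records. A worked example with hypothetical numbers (65536 records per chapter and a 64-byte uds_volume_record):

    5 (magic "ALBOC") + 5 (version "02.00") + 4 (record count)
        + 65536 * 64 (records) = 4194318 bytes,

which the buffered writer then pads with zeros out to a whole number of 4 KB blocks.
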
diff --git a/drivers/md/dm-vdo/radix-sort.c b/drivers/md/dm-vdo/radix-sort.c
deleted file mode 100644 (file)
index 1f17c70..0000000
+++ /dev/null
@@ -1,332 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "radix-sort.h"
-
-#include <linux/limits.h>
-#include <linux/types.h>
-
-#include "memory-alloc.h"
-#include "string-utils.h"
-
-/*
- * This implementation allocates one large object to do the sorting, which can be reused as many
- * times as desired. The amount of memory required is logarithmically proportional to the number of
- * keys to be sorted.
- */
-
-enum {
-       /* Piles smaller than this are handled with a simple insertion sort. */
-       INSERTION_SORT_THRESHOLD = 12,
-};
-
-/* Sort keys are pointers to immutable fixed-length arrays of bytes. */
-typedef const u8 *sort_key_t;
-
-/*
- * The keys are separated into piles based on the byte in each key at the current offset, so the
- * number of keys with each byte must be counted.
- */
-struct histogram {
-       /* The number of non-empty bins */
-       u16 used;
-       /* The index (key byte) of the first non-empty bin */
-       u16 first;
-       /* The index (key byte) of the last non-empty bin */
-       u16 last;
-       /* The number of occurrences of each specific byte */
-       u32 size[256];
-};
-
-/*
- * Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound
- * on the stack space needed.
- */
-struct task {
-       /* Pointer to the first key to sort. */
-       sort_key_t *first_key;
-       /* Pointer to the last key to sort. */
-       sort_key_t *last_key;
-       /* The offset into the key at which to continue sorting. */
-       u16 offset;
-       /* The number of bytes remaining in the sort keys. */
-       u16 length;
-};
-
-struct radix_sorter {
-       unsigned int count;
-       struct histogram bins;
-       sort_key_t *pile[256];
-       struct task *end_of_stack;
-       struct task insertion_list[256];
-       struct task stack[];
-};
-
-/* Compare a segment of two fixed-length keys starting at an offset. */
-static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length)
-{
-       return memcmp(&key1[offset], &key2[offset], length);
-}
-
-/* Insert the next unsorted key into an array of sorted keys. */
-static inline void insert_key(const struct task task, sort_key_t *next)
-{
-       /* Pull the unsorted key out, freeing up the array slot. */
-       sort_key_t unsorted = *next;
-
-       /* Compare the key to the preceding sorted entries, shifting down ones that are larger. */
-       while ((--next >= task.first_key) &&
-              (compare(unsorted, next[0], task.offset, task.length) < 0))
-               next[1] = next[0];
-
-       /* Insert the key into the last slot that was cleared, sorting it. */
-       next[1] = unsorted;
-}
-
-/*
- * Sort a range of key segments using an insertion sort. This simple sort is faster than the
- * 256-way radix sort when the number of keys to sort is small.
- */
-static inline void insertion_sort(const struct task task)
-{
-       sort_key_t *next;
-
-       for (next = task.first_key + 1; next <= task.last_key; next++)
-               insert_key(task, next);
-}
-
-/* Push a sorting task onto a task stack. */
-static inline void push_task(struct task **stack_pointer, sort_key_t *first_key,
-                            u32 count, u16 offset, u16 length)
-{
-       struct task *task = (*stack_pointer)++;
-
-       task->first_key = first_key;
-       task->last_key = &first_key[count - 1];
-       task->offset = offset;
-       task->length = length;
-}
-
-static inline void swap_keys(sort_key_t *a, sort_key_t *b)
-{
-       sort_key_t c = *a;
-       *a = *b;
-       *b = c;
-}
-
-/*
- * Count the number of times each byte value appears in the array of keys to sort at the current
- * offset, keeping track of the number of non-empty bins, and the index of the first and last
- * non-empty bin.
- */
-static inline void measure_bins(const struct task task, struct histogram *bins)
-{
-       sort_key_t *key_ptr;
-
-       /*
-        * Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears
-        * them out as it goes. Even though this structure is re-used, we don't need to pay to
-        * zero it before starting a new tally.
-        */
-       bins->first = U8_MAX;
-       bins->last = 0;
-
-       for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) {
-               /* Increment the count for the byte in the key at the current offset. */
-               u8 bin = (*key_ptr)[task.offset];
-               u32 size = ++bins->size[bin];
-
-               /* Track non-empty bins. */
-               if (size == 1) {
-                       bins->used += 1;
-                       if (bin < bins->first)
-                               bins->first = bin;
-
-                       if (bin > bins->last)
-                               bins->last = bin;
-               }
-       }
-}
-
-/*
- * Convert the bin sizes to pointers to where each pile goes.
- *
- *   pile[0] = first_key + bins->size[0],
- *   pile[1] = pile[0]  + bins->size[1], etc.
- *
- * After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the
- * next radix position. A new task is put on the stack for each pile containing lots of keys, or a
- * new task is put on the list for each pile containing few keys.
- *
- * @stack: pointer to the top of the stack
- * @end_of_stack: the end of the stack
- * @list: pointer to the head of the list
- * @pile: array for pointers to the end of each pile
- * @bins: the histogram of the sizes of each pile
- * @first_key: the first key of the stack
- * @offset: the next radix position to sort by
- * @length: the number of bytes remaining in the sort keys
- *
- * Return: UDS_SUCCESS or an error code
- */
-static inline int push_bins(struct task **stack, struct task *end_of_stack,
-                           struct task **list, sort_key_t *pile[],
-                           struct histogram *bins, sort_key_t *first_key,
-                           u16 offset, u16 length)
-{
-       sort_key_t *pile_start = first_key;
-       int bin;
-
-       for (bin = bins->first; ; bin++) {
-               u32 size = bins->size[bin];
-
-               /* Skip empty piles. */
-               if (size == 0)
-                       continue;
-
-               /* There's no need to sort empty keys. */
-               if (length > 0) {
-                       if (size > INSERTION_SORT_THRESHOLD) {
-                               if (*stack >= end_of_stack)
-                                       return UDS_BAD_STATE;
-
-                               push_task(stack, pile_start, size, offset, length);
-                       } else if (size > 1) {
-                               push_task(list, pile_start, size, offset, length);
-                       }
-               }
-
-               pile_start += size;
-               pile[bin] = pile_start;
-               if (--bins->used == 0)
-                       break;
-       }
-
-       return UDS_SUCCESS;
-}
-
-int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter)
-{
-       int result;
-       unsigned int stack_size = count / INSERTION_SORT_THRESHOLD;
-       struct radix_sorter *radix_sorter;
-
-       result = uds_allocate_extended(struct radix_sorter, stack_size, struct task,
-                                      __func__, &radix_sorter);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       radix_sorter->count = count;
-       radix_sorter->end_of_stack = radix_sorter->stack + stack_size;
-       *sorter = radix_sorter;
-       return UDS_SUCCESS;
-}
-
-void uds_free_radix_sorter(struct radix_sorter *sorter)
-{
-       uds_free(sorter);
-}
-
-/*
- * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort implementation
- * is unstable, so the relative ordering of equal keys is not preserved.
- */
-int uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
-                  unsigned int count, unsigned short length)
-{
-       struct task start;
-       struct histogram *bins = &sorter->bins;
-       sort_key_t **pile = sorter->pile;
-       struct task *task_stack = sorter->stack;
-
-       /* All zero-length keys are identical and therefore already sorted. */
-       if ((count == 0) || (length == 0))
-               return UDS_SUCCESS;
-
-       /* The initial task is to sort the entire length of all the keys. */
-       start = (struct task) {
-               .first_key = keys,
-               .last_key = &keys[count - 1],
-               .offset = 0,
-               .length = length,
-       };
-
-       if (count <= INSERTION_SORT_THRESHOLD) {
-               insertion_sort(start);
-               return UDS_SUCCESS;
-       }
-
-       if (count > sorter->count)
-               return UDS_INVALID_ARGUMENT;
-
-       /*
-        * Repeatedly consume a sorting task from the stack and process it, pushing new sub-tasks
-        * onto the stack for each radix-sorted pile. When all tasks and sub-tasks have been
-        * processed, the stack will be empty and all the keys in the starting task will be fully
-        * sorted.
-        */
-       for (*task_stack = start; task_stack >= sorter->stack; task_stack--) {
-               const struct task task = *task_stack;
-               struct task *insertion_task_list;
-               int result;
-               sort_key_t *fence;
-               sort_key_t *end;
-
-               measure_bins(task, bins);
-
-               /*
-                * Now that we know how large each bin is, generate pointers for each of the piles
-                * and push a new task to sort each pile by the next radix byte.
-                */
-               insertion_task_list = sorter->insertion_list;
-               result = push_bins(&task_stack, sorter->end_of_stack,
-                                  &insertion_task_list, pile, bins, task.first_key,
-                                  task.offset + 1, task.length - 1);
-               if (result != UDS_SUCCESS) {
-                       memset(bins, 0, sizeof(*bins));
-                       return result;
-               }
-
-               /* Now bins->used is zero again. */
-
-               /*
-                * Don't bother processing the last pile: when piles 0..N-1 are all in place, then
-                * pile N must also be in place.
-                */
-               end = task.last_key - bins->size[bins->last];
-               bins->size[bins->last] = 0;
-
-               for (fence = task.first_key; fence <= end; ) {
-                       u8 bin;
-                       sort_key_t key = *fence;
-
-                       /*
-                        * The radix byte of the key tells us which pile it belongs in. Swap it for
-                        * an unprocessed item just below that pile, and repeat.
-                        */
-                       while (--pile[bin = key[task.offset]] > fence)
-                               swap_keys(pile[bin], &key);
-
-                       /*
-                        * The pile reached the fence. Put the key at the bottom of that pile,
-                        * completing it, and advance the fence to the next pile.
-                        */
-                       *fence = key;
-                       fence += bins->size[bin];
-                       bins->size[bin] = 0;
-               }
-
-               /* Now bins->size[] is all zero again. */
-
-               /*
-                * When the number of keys in a task gets small enough, it is faster to use an
-                * insertion sort than to keep subdividing into tiny piles.
-                */
-               while (--insertion_task_list >= sorter->insertion_list)
-                       insertion_sort(*insertion_task_list);
-       }
-
-       return UDS_SUCCESS;
-}
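
Typical use of the sorter is to allocate it once for the largest expected key count and then reuse it for each batch of fixed-length keys, via uds_make_radix_sorter(), uds_radix_sort(), and uds_free_radix_sorter(). A minimal sketch; the function name and the 16-byte key length are hypothetical, illustrative only:

/* Illustrative sketch only; the 16-byte key length is hypothetical. */
static int sort_keys_example(const unsigned char *keys[], unsigned int count)
{
        struct radix_sorter *sorter;
        int result;

        result = uds_make_radix_sorter(count, &sorter);
        if (result != UDS_SUCCESS)
                return result;

        /* The sort is unstable: equal keys may be reordered relative to each other. */
        result = uds_radix_sort(sorter, keys, count, 16);
        uds_free_radix_sorter(sorter);
        return result;
}
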
diff --git a/drivers/md/dm-vdo/radix-sort.h b/drivers/md/dm-vdo/radix-sort.h
deleted file mode 100644 (file)
index 812949b..0000000
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_RADIX_SORT_H
-#define UDS_RADIX_SORT_H
-
-/*
- * Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix
- * exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith
- * Bostic, and M. Douglas McIlroy, "Engineering Radix Sort".
- *
- * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf
- */
-
-struct radix_sorter;
-
-int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter);
-
-void uds_free_radix_sorter(struct radix_sorter *sorter);
-
-int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
-                               unsigned int count, unsigned short length);
-
-#endif /* UDS_RADIX_SORT_H */
diff --git a/drivers/md/dm-vdo/sparse-cache.c b/drivers/md/dm-vdo/sparse-cache.c
deleted file mode 100644 (file)
index b43a626..0000000
+++ /dev/null
@@ -1,625 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "sparse-cache.h"
-
-#include <linux/cache.h>
-#include <linux/delay.h>
-#include <linux/dm-bufio.h>
-
-#include "chapter-index.h"
-#include "config.h"
-#include "index.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "permassert.h"
-
-/*
- * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a
- * specific virtual chapter is implemented as a linear search. The cache replacement policy is
- * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be
- * maintained by shifting entries in an array list.
- *
- * Changing the contents of the cache requires the coordinated participation of all zone threads
- * via the careful use of barrier messages sent to all the index zones by the triage queue worker
- * thread. The critical invariant for coordination is that the cache membership must not change
- * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads
- * receive the same results for every virtual chapter number. To ensure that critical invariant,
- * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that
- * chapter because it has had too many cache misses" are represented separately from the cache
- * membership information (the virtual chapter number).
- *
- * As a result of this invariant, we have the guarantee that every zone thread will call
- * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache,
- * and the serialization of the barrier requests from the triage queue ensures they will all
- * request the same chapter number. This means the only synchronization we need can be provided by
- * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical
- * section where a single zone thread can drive the cache update while all the other zone threads
- * are known to be blocked, waiting in the second barrier. Outside that critical section, all the
- * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an
- * exclusive lock. No other threads may access or modify the cache entries.
- *
- * Chapter statistics must only be modified by a single thread, which is also the zone zero thread.
- * All fields that might be frequently updated by that thread are kept in separate cache-aligned
- * structures so they will not cause cache contention via "false sharing" with the fields that are
- * frequently accessed by all of the zone threads.
- *
- * The LRU order is managed independently by each zone thread, and each zone uses its own list for
- * searching and cache membership queries. The zone zero list is used to decide which chapter to
- * evict when the cache is updated, and its search list is copied to the other threads at that
- * time.
- *
- * The virtual chapter number field of the cache entry is the single field indicating whether a
- * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or
- * undefined chapter number. When present in the virtual chapter number field of a
- * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of
- * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any
- * cache entry that is not marked as dead is fully defined and a member of the cache, and
- * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears
- * in any of the cache entries.
- *
- * A chapter index that is a member of the cache may be excluded from searches between calls to
- * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the
- * volume, its virtual chapter number will be less than the oldest virtual chapter number. Since
- * that chapter is no longer part of the volume, there's no point in continuing to search that
- * chapter index. Once invalidated, that virtual chapter will still be considered a member of the
- * cache, but it will no longer be searched for matching names.
- *
- * The second mechanism is a heuristic based on keeping track of the number of consecutive search
- * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will
- * be set to true, causing the chapter to be skipped when searching the entire cache, but still
- * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will
- * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry.
- * Again, regardless of the state of the skip_search flag, the virtual chapter must still be
- * considered a member of the cache for uds_sparse_cache_contains().
- */
-
-enum {
-       SKIP_SEARCH_THRESHOLD = 20000,
-       ZONE_ZERO = 0,
-};
-
-/*
- * These counters are essentially fields of the struct cached_chapter_index, but are segregated
- * into this structure because they are frequently modified. They are grouped and aligned to keep
- * them on different cache lines from the chapter fields that are accessed far more often than they
- * are updated.
- */
-struct __aligned(L1_CACHE_BYTES) cached_index_counters {
-       u64 consecutive_misses;
-};
-
-struct __aligned(L1_CACHE_BYTES) cached_chapter_index {
-       /*
-        * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache
-        * entry is unused. This field must only be modified in the critical section in
-        * uds_update_sparse_cache().
-        */
-       u64 virtual_chapter;
-
-       u32 index_pages_count;
-
-       /*
-        * These pointers are immutable during the life of the cache. The contents of the arrays
-        * change when the cache entry is replaced.
-        */
-       struct delta_index_page *index_pages;
-       struct dm_buffer **page_buffers;
-
-       /*
-        * If set, skip the chapter when searching the entire cache. This flag is just a
-        * performance optimization. The flag is mutable between cache updates, but it rarely
-        * changes and is frequently accessed, so it is grouped with the immutable fields.
-        */
-       bool skip_search;
-
-       /*
-        * The cache-aligned counters change often and are placed at the end of the structure to
-        * prevent false sharing with the more stable fields above.
-        */
-       struct cached_index_counters counters;
-};
-
-/*
- * A search_list represents an ordering of the sparse chapter index cache entry array, from most
- * recently accessed to least recently accessed, which is the order in which the indexes should be
- * searched and the reverse order in which they should be evicted from the cache.
- *
- * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even
- * iterate over them to search, and ensuring that dead entries are replaced before any live entries
- * are evicted.
- *
- * The search list is instantiated for each zone thread, avoiding any need for synchronization. The
- * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between
- * zone threads.
- */
-struct search_list {
-       u8 capacity;
-       u8 first_dead_entry;
-       struct cached_chapter_index *entries[];
-};
-
-struct threads_barrier {
-       /* Lock for this barrier object */
-       struct semaphore lock;
-       /* Semaphore for threads waiting at this barrier */
-       struct semaphore wait;
-       /* Number of threads which have arrived */
-       int arrived;
-       /* Total number of threads using this barrier */
-       int thread_count;
-};
-
-struct sparse_cache {
-       const struct index_geometry *geometry;
-       unsigned int capacity;
-       unsigned int zone_count;
-
-       unsigned int skip_threshold;
-       struct search_list *search_lists[MAX_ZONES];
-       struct cached_chapter_index **scratch_entries;
-
-       struct threads_barrier begin_update_barrier;
-       struct threads_barrier end_update_barrier;
-
-       struct cached_chapter_index chapters[];
-};
-
-static void initialize_threads_barrier(struct threads_barrier *barrier,
-                                      unsigned int thread_count)
-{
-       sema_init(&barrier->lock, 1);
-       barrier->arrived = 0;
-       barrier->thread_count = thread_count;
-       sema_init(&barrier->wait, 0);
-}
-
-static inline void __down(struct semaphore *semaphore)
-{
-       /*
-        * Do not use down(semaphore). Instead use down_interruptible so that
-        * we do not get 120 second stall messages in kern.log.
-        */
-       while (down_interruptible(semaphore) != 0) {
-               /*
-                * If we're called from a user-mode process (e.g., "dmsetup
-                * remove") while waiting for an operation that may take a
-                * while (e.g., UDS index save), and a signal is sent (SIGINT,
-                * SIGUSR2), then down_interruptible will not block. If that
-                * happens, sleep briefly to avoid keeping the CPU locked up in
-                * this loop. We could just call cond_resched, but then we'd
-                * still keep consuming CPU time slices and swamp other threads
-                * trying to do computational work.
-                */
-               fsleep(1000);
-       }
-}
-
-static void enter_threads_barrier(struct threads_barrier *barrier)
-{
-       __down(&barrier->lock);
-       if (++barrier->arrived == barrier->thread_count) {
-               /* last thread */
-               int i;
-
-               for (i = 1; i < barrier->thread_count; i++)
-                       up(&barrier->wait);
-
-               barrier->arrived = 0;
-               up(&barrier->lock);
-       } else {
-               up(&barrier->lock);
-               __down(&barrier->wait);
-       }
-}
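
The two thread barriers above are the whole synchronization story for cache updates: every zone thread enters the first barrier, zone zero alone modifies the cache entries, and then every thread enters the second barrier before resuming searches. A simplified sketch of that shape; the real logic is in uds_update_sparse_cache(), and both the sketch function and evict_and_replace_chapter() are hypothetical stand-ins:

/* Hypothetical stand-in for the work zone zero does inside the critical section. */
static int evict_and_replace_chapter(struct sparse_cache *cache, u64 virtual_chapter);

static int update_cache_sketch(struct sparse_cache *cache, unsigned int zone_number,
                               u64 virtual_chapter)
{
        int result = UDS_SUCCESS;

        /* No thread proceeds until every zone thread has arrived. */
        enter_threads_barrier(&cache->begin_update_barrier);

        /* Only zone zero touches the cache entries; the other zones wait at the next barrier. */
        if (zone_number == ZONE_ZERO)
                result = evict_and_replace_chapter(cache, virtual_chapter);

        /* No thread resumes searching until the update is complete. */
        enter_threads_barrier(&cache->end_update_barrier);
        return result;
}
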
-
-static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter,
-                                                       const struct index_geometry *geometry)
-{
-       int result;
-
-       chapter->virtual_chapter = NO_CHAPTER;
-       chapter->index_pages_count = geometry->index_pages_per_chapter;
-
-       result = uds_allocate(chapter->index_pages_count, struct delta_index_page,
-                             __func__, &chapter->index_pages);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return uds_allocate(chapter->index_pages_count, struct dm_buffer *,
-                           "sparse index volume pages", &chapter->page_buffers);
-}
-
-static int __must_check make_search_list(struct sparse_cache *cache,
-                                        struct search_list **list_ptr)
-{
-       struct search_list *list;
-       unsigned int bytes;
-       u8 i;
-       int result;
-
-       bytes = (sizeof(struct search_list) +
-                (cache->capacity * sizeof(struct cached_chapter_index *)));
-       result = uds_allocate_cache_aligned(bytes, "search list", &list);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       list->capacity = cache->capacity;
-       list->first_dead_entry = 0;
-
-       for (i = 0; i < list->capacity; i++)
-               list->entries[i] = &cache->chapters[i];
-
-       *list_ptr = list;
-       return UDS_SUCCESS;
-}
-
-int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity,
-                         unsigned int zone_count, struct sparse_cache **cache_ptr)
-{
-       int result;
-       unsigned int i;
-       struct sparse_cache *cache;
-       unsigned int bytes;
-
-       bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index)));
-       result = uds_allocate_cache_aligned(bytes, "sparse cache", &cache);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       cache->geometry = geometry;
-       cache->capacity = capacity;
-       cache->zone_count = zone_count;
-
-       /*
-        * Scale down the skip threshold since the cache only counts cache misses in zone zero, but
-        * requests are being handled in all zones.
-        */
-       cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count);
-
-       initialize_threads_barrier(&cache->begin_update_barrier, zone_count);
-       initialize_threads_barrier(&cache->end_update_barrier, zone_count);
-
-       for (i = 0; i < capacity; i++) {
-               result = initialize_cached_chapter_index(&cache->chapters[i], geometry);
-               if (result != UDS_SUCCESS)
-                       goto out;
-       }
-
-       for (i = 0; i < zone_count; i++) {
-               result = make_search_list(cache, &cache->search_lists[i]);
-               if (result != UDS_SUCCESS)
-                       goto out;
-       }
-
-       /* purge_search_list() needs some temporary lists for sorting. */
-       result = uds_allocate(capacity * 2, struct cached_chapter_index *,
-                             "scratch entries", &cache->scratch_entries);
-       if (result != UDS_SUCCESS)
-               goto out;
-
-       *cache_ptr = cache;
-       return UDS_SUCCESS;
-out:
-       uds_free_sparse_cache(cache);
-       return result;
-}
-
-static inline void set_skip_search(struct cached_chapter_index *chapter,
-                                  bool skip_search)
-{
-       /* Check before setting to reduce cache line contention. */
-       if (READ_ONCE(chapter->skip_search) != skip_search)
-               WRITE_ONCE(chapter->skip_search, skip_search);
-}
-
-static void score_search_hit(struct cached_chapter_index *chapter)
-{
-       chapter->counters.consecutive_misses = 0;
-       set_skip_search(chapter, false);
-}
-
-static void score_search_miss(struct sparse_cache *cache,
-                             struct cached_chapter_index *chapter)
-{
-       chapter->counters.consecutive_misses++;
-       if (chapter->counters.consecutive_misses > cache->skip_threshold)
-               set_skip_search(chapter, true);
-}
-
-static void release_cached_chapter_index(struct cached_chapter_index *chapter)
-{
-       unsigned int i;
-
-       chapter->virtual_chapter = NO_CHAPTER;
-       if (chapter->page_buffers == NULL)
-               return;
-
-       for (i = 0; i < chapter->index_pages_count; i++) {
-               if (chapter->page_buffers[i] != NULL)
-                       dm_bufio_release(uds_forget(chapter->page_buffers[i]));
-       }
-}
-
-void uds_free_sparse_cache(struct sparse_cache *cache)
-{
-       unsigned int i;
-
-       if (cache == NULL)
-               return;
-
-       uds_free(cache->scratch_entries);
-
-       for (i = 0; i < cache->zone_count; i++)
-               uds_free(cache->search_lists[i]);
-
-       for (i = 0; i < cache->capacity; i++) {
-               release_cached_chapter_index(&cache->chapters[i]);
-               uds_free(cache->chapters[i].index_pages);
-               uds_free(cache->chapters[i].page_buffers);
-       }
-
-       uds_free(cache);
-}
-
-/*
- * Take the indicated element of the search list and move it to the start, pushing the pointers
- * previously before it back down the list.
- */
-static inline void set_newest_entry(struct search_list *search_list, u8 index)
-{
-       struct cached_chapter_index *newest;
-
-       if (index > 0) {
-               newest = search_list->entries[index];
-               memmove(&search_list->entries[1], &search_list->entries[0],
-                       index * sizeof(struct cached_chapter_index *));
-               search_list->entries[0] = newest;
-       }
-
-       /*
-        * This function may have moved a dead chapter to the front of the list for reuse, in which
-        * case the set of dead chapters becomes smaller.
-        */
-       if (search_list->first_dead_entry <= index)
-               search_list->first_dead_entry++;
-}
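
The move-to-front step above is easiest to see in isolation. A minimal stand-alone sketch (plain integers instead of cached_chapter_index pointers; the names are illustrative, not from the indexer):

	#include <stdio.h>
	#include <string.h>

	/* Move entries[index] to the front, sliding the entries before it down by one. */
	static void move_to_front(int *entries, unsigned int index)
	{
		int newest = entries[index];

		if (index == 0)
			return;

		memmove(&entries[1], &entries[0], index * sizeof(entries[0]));
		entries[0] = newest;
	}

	int main(void)
	{
		int entries[] = { 10, 11, 12, 13, 14 };
		unsigned int i;

		move_to_front(entries, 3);
		for (i = 0; i < 5; i++)
			printf("%d ", entries[i]);	/* prints: 13 10 11 12 14 */
		printf("\n");
		return 0;
	}
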
-
-bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
-                              unsigned int zone_number)
-{
-       struct search_list *search_list;
-       struct cached_chapter_index *chapter;
-       u8 i;
-
-       /*
-        * The correctness of the barriers depends on the invariant that between calls to
-        * uds_update_sparse_cache(), the answers this function returns must never vary: the result
-        * for a given chapter must be identical across zones. That invariant must be maintained
-        * even if the chapter falls off the end of the volume, or if searching it is disabled
-        * because of too many search misses.
-        */
-       search_list = cache->search_lists[zone_number];
-       for (i = 0; i < search_list->first_dead_entry; i++) {
-               chapter = search_list->entries[i];
-
-               if (virtual_chapter == chapter->virtual_chapter) {
-                       if (zone_number == ZONE_ZERO)
-                               score_search_hit(chapter);
-
-                       set_newest_entry(search_list, i);
-                       return true;
-               }
-       }
-
-       return false;
-}
-
-/*
- * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU
- * ordering that already existed. This operation must only be called during the critical section in
- * uds_update_sparse_cache().
- */
-static void purge_search_list(struct search_list *search_list,
-                             struct sparse_cache *cache, u64 oldest_virtual_chapter)
-{
-       struct cached_chapter_index **entries;
-       struct cached_chapter_index **skipped;
-       struct cached_chapter_index **dead;
-       struct cached_chapter_index *chapter;
-       unsigned int next_alive = 0;
-       unsigned int next_skipped = 0;
-       unsigned int next_dead = 0;
-       unsigned int i;
-
-       entries = &search_list->entries[0];
-       skipped = &cache->scratch_entries[0];
-       dead = &cache->scratch_entries[search_list->capacity];
-
-       for (i = 0; i < search_list->first_dead_entry; i++) {
-               chapter = search_list->entries[i];
-               if ((chapter->virtual_chapter < oldest_virtual_chapter) ||
-                   (chapter->virtual_chapter == NO_CHAPTER))
-                       dead[next_dead++] = chapter;
-               else if (chapter->skip_search)
-                       skipped[next_skipped++] = chapter;
-               else
-                       entries[next_alive++] = chapter;
-       }
-
-       memcpy(&entries[next_alive], skipped,
-              next_skipped * sizeof(struct cached_chapter_index *));
-       memcpy(&entries[next_alive + next_skipped], dead,
-              next_dead * sizeof(struct cached_chapter_index *));
-       search_list->first_dead_entry = next_alive + next_skipped;
-}
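
purge_search_list() is a stable three-way partition: alive entries keep their LRU order at the front, skippable entries follow, and dead entries sink to the end. A simplified stand-alone model (integer chapters, with -1 standing in for NO_CHAPTER; the names and types are hypothetical):

	#include <stdio.h>
	#include <string.h>

	struct entry {
		long long chapter;	/* -1 models a dead (NO_CHAPTER) entry */
		int skip_search;
	};

	/*
	 * Stable three-way partition: alive entries stay at the front, skippable
	 * entries follow, dead entries go last, each group keeping its order.
	 * Returns the new first_dead_entry index.
	 */
	static unsigned int purge(struct entry **entries, struct entry **scratch,
				  unsigned int count, long long oldest_chapter)
	{
		struct entry **skipped = &scratch[0];
		struct entry **dead = &scratch[count];
		unsigned int next_alive = 0, next_skipped = 0, next_dead = 0, i;

		for (i = 0; i < count; i++) {
			struct entry *e = entries[i];

			if ((e->chapter < oldest_chapter) || (e->chapter == -1))
				dead[next_dead++] = e;
			else if (e->skip_search)
				skipped[next_skipped++] = e;
			else
				entries[next_alive++] = e;
		}

		memcpy(&entries[next_alive], skipped, next_skipped * sizeof(*entries));
		memcpy(&entries[next_alive + next_skipped], dead,
		       next_dead * sizeof(*entries));
		return next_alive + next_skipped;
	}

	int main(void)
	{
		struct entry e[4] = { { 7, 0 }, { 3, 0 }, { 9, 1 }, { -1, 0 } };
		struct entry *list[4] = { &e[0], &e[1], &e[2], &e[3] };
		struct entry *scratch[8];
		unsigned int first_dead = purge(list, scratch, 4, 5);
		unsigned int i;

		for (i = 0; i < 4; i++)
			printf("%lld ", list[i]->chapter);	/* prints: 7 9 3 -1 */
		printf("\nfirst_dead_entry = %u\n", first_dead);	/* 2 */
		return 0;
	}
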
-
-static int __must_check cache_chapter_index(struct cached_chapter_index *chapter,
-                                           u64 virtual_chapter,
-                                           const struct volume *volume)
-{
-       int result;
-
-       release_cached_chapter_index(chapter);
-
-       result = uds_read_chapter_index_from_volume(volume, virtual_chapter,
-                                                   chapter->page_buffers,
-                                                   chapter->index_pages);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       chapter->counters.consecutive_misses = 0;
-       chapter->virtual_chapter = virtual_chapter;
-       chapter->skip_search = false;
-
-       return UDS_SUCCESS;
-}
-
-static inline void copy_search_list(const struct search_list *source,
-                                   struct search_list *target)
-{
-       *target = *source;
-       memcpy(target->entries, source->entries,
-              source->capacity * sizeof(struct cached_chapter_index *));
-}
-
-/*
- * Update the sparse cache to contain a chapter index. This function must be called by all the zone
- * threads with the same chapter number to correctly enter the thread barriers used to synchronize
- * the cache updates.
- */
-int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter)
-{
-       int result = UDS_SUCCESS;
-       const struct uds_index *index = zone->index;
-       struct sparse_cache *cache = index->volume->sparse_cache;
-
-       if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id))
-               return UDS_SUCCESS;
-
-       /*
-        * Wait for every zone thread to reach its corresponding barrier request and invoke this
-        * function before starting to modify the cache.
-        */
-       enter_threads_barrier(&cache->begin_update_barrier);
-
-       /*
-        * This is the start of the critical section: the zone zero thread is captain, effectively
-        * holding an exclusive lock on the sparse cache. All the other zone threads must do
-        * nothing between the two barriers. They will wait at the end_update_barrier again for the
-        * captain to finish the update.
-        */
-
-       if (zone->id == ZONE_ZERO) {
-               unsigned int z;
-               struct search_list *list = cache->search_lists[ZONE_ZERO];
-
-               purge_search_list(list, cache, zone->oldest_virtual_chapter);
-
-               if (virtual_chapter >= index->oldest_virtual_chapter) {
-                       set_newest_entry(list, list->capacity - 1);
-                       result = cache_chapter_index(list->entries[0], virtual_chapter,
-                                                    index->volume);
-               }
-
-               for (z = 1; z < cache->zone_count; z++)
-                       copy_search_list(list, cache->search_lists[z]);
-       }
-
-       /*
-        * This is the end of the critical section. All cache invariants must have been restored.
-        */
-       enter_threads_barrier(&cache->end_update_barrier);
-       return result;
-}
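
The two-barrier "captain" pattern used here is the standard one: every participant arrives, one designated thread does the work, and nobody leaves until it is done. A hedged user-space analogue with POSIX barriers (the names are made up; this is not the indexer's threads-barrier API):

	/* build: cc -pthread barrier_demo.c */
	#define _POSIX_C_SOURCE 200809L
	#include <pthread.h>
	#include <stdio.h>

	#define ZONES 4

	static pthread_barrier_t begin_update, end_update;

	static void update_shared_state(int zone_id)
	{
		/* Everyone must arrive before the shared state may change. */
		pthread_barrier_wait(&begin_update);

		/* Critical section: only the "captain" (zone 0) touches shared data. */
		if (zone_id == 0)
			printf("zone 0 performs the cache update\n");

		/* Nobody proceeds until the captain has restored all invariants. */
		pthread_barrier_wait(&end_update);
	}

	static void *zone_thread(void *arg)
	{
		update_shared_state((int) (long) arg);
		return NULL;
	}

	int main(void)
	{
		pthread_t threads[ZONES];
		long z;

		pthread_barrier_init(&begin_update, NULL, ZONES);
		pthread_barrier_init(&end_update, NULL, ZONES);

		for (z = 0; z < ZONES; z++)
			pthread_create(&threads[z], NULL, zone_thread, (void *) z);
		for (z = 0; z < ZONES; z++)
			pthread_join(threads[z], NULL);

		pthread_barrier_destroy(&begin_update);
		pthread_barrier_destroy(&end_update);
		return 0;
	}
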
-
-void uds_invalidate_sparse_cache(struct sparse_cache *cache)
-{
-       unsigned int i;
-
-       for (i = 0; i < cache->capacity; i++)
-               release_cached_chapter_index(&cache->chapters[i]);
-}
-
-static inline bool should_skip_chapter(struct cached_chapter_index *chapter,
-                                      u64 oldest_chapter, u64 requested_chapter)
-{
-       if ((chapter->virtual_chapter == NO_CHAPTER) ||
-           (chapter->virtual_chapter < oldest_chapter))
-               return true;
-
-       if (requested_chapter != NO_CHAPTER)
-               return requested_chapter != chapter->virtual_chapter;
-       else
-               return READ_ONCE(chapter->skip_search);
-}
-
-static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter,
-                                                   const struct index_geometry *geometry,
-                                                   const struct index_page_map *index_page_map,
-                                                   const struct uds_record_name *name,
-                                                   u16 *record_page_ptr)
-{
-       u32 physical_chapter =
-               uds_map_to_physical_chapter(geometry, chapter->virtual_chapter);
-       u32 index_page_number =
-               uds_find_index_page_number(index_page_map, name, physical_chapter);
-       struct delta_index_page *index_page =
-               &chapter->index_pages[index_page_number];
-
-       return uds_search_chapter_index_page(index_page, geometry, name,
-                                            record_page_ptr);
-}
-
-int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name,
-                           u64 *virtual_chapter_ptr, u16 *record_page_ptr)
-{
-       int result;
-       struct volume *volume = zone->index->volume;
-       struct sparse_cache *cache = volume->sparse_cache;
-       struct cached_chapter_index *chapter;
-       struct search_list *search_list;
-       u8 i;
-       /* Search the entire cache unless a specific chapter was requested. */
-       bool search_one = (*virtual_chapter_ptr != NO_CHAPTER);
-
-       *record_page_ptr = NO_CHAPTER_INDEX_ENTRY;
-       search_list = cache->search_lists[zone->id];
-       for (i = 0; i < search_list->first_dead_entry; i++) {
-               chapter = search_list->entries[i];
-
-               if (should_skip_chapter(chapter, zone->oldest_virtual_chapter,
-                                       *virtual_chapter_ptr))
-                       continue;
-
-               result = search_cached_chapter_index(chapter, cache->geometry,
-                                                    volume->index_page_map, name,
-                                                    record_page_ptr);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) {
-                       /*
-                        * In theory, this might be a false match while a true match exists in
-                        * another chapter, but that's a very rare case and not worth the extra
-                        * search complexity.
-                        */
-                       set_newest_entry(search_list, i);
-                       if (zone->id == ZONE_ZERO)
-                               score_search_hit(chapter);
-
-                       *virtual_chapter_ptr = chapter->virtual_chapter;
-                       return UDS_SUCCESS;
-               }
-
-               if (zone->id == ZONE_ZERO)
-                       score_search_miss(cache, chapter);
-
-               if (search_one)
-                       break;
-       }
-
-       return UDS_SUCCESS;
-}
diff --git a/drivers/md/dm-vdo/sparse-cache.h b/drivers/md/dm-vdo/sparse-cache.h
deleted file mode 100644 (file)
index 45e2dcf..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_SPARSE_CACHE_H
-#define UDS_SPARSE_CACHE_H
-
-#include "geometry.h"
-#include "indexer.h"
-
-/*
- * The sparse cache is a cache of entire chapter indexes from sparse chapters used for searching
- * for names after all other search paths have failed. It contains only complete chapter indexes;
- * record pages from sparse chapters and single index pages used for resolving hooks are kept in
- * the regular page cache in the volume.
- *
- * The most important property of this cache is the absence of synchronization for read operations.
- * Safe concurrent access to the cache by the zone threads is controlled by the triage queue and
- * the barrier requests it issues to the zone queues. The set of cached chapters does not and must
- * not change between the carefully coordinated calls to uds_update_sparse_cache() from the zone
- * threads. Outside of updates, every zone will get the same result when calling
- * uds_sparse_cache_contains() as every other zone.
- */
-
-struct index_zone;
-struct sparse_cache;
-
-int __must_check uds_make_sparse_cache(const struct index_geometry *geometry,
-                                      unsigned int capacity, unsigned int zone_count,
-                                      struct sparse_cache **cache_ptr);
-
-void uds_free_sparse_cache(struct sparse_cache *cache);
-
-bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter,
-                              unsigned int zone_number);
-
-int __must_check uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter);
-
-void uds_invalidate_sparse_cache(struct sparse_cache *cache);
-
-int __must_check uds_search_sparse_cache(struct index_zone *zone,
-                                        const struct uds_record_name *name,
-                                        u64 *virtual_chapter_ptr, u16 *record_page_ptr);
-
-#endif /* UDS_SPARSE_CACHE_H */
index 1548092e7de1db60c9493050987db49bf25600a3..2c4fb277ba388b4a121ab232813dc39207fb6f3c 100644 (file)
@@ -9,11 +9,12 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 
-#include "indexer.h"
 #include "logger.h"
 #include "memory-alloc.h"
 #include "string-utils.h"
 
+#include "indexer.h"
+
 #define UDS_SYSFS_NAME "uds"
 
 static struct {
diff --git a/drivers/md/dm-vdo/volume-index.c b/drivers/md/dm-vdo/volume-index.c
deleted file mode 100644 (file)
index 36e3c2e..0000000
+++ /dev/null
@@ -1,1280 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-#include "volume-index.h"
-
-#include <linux/bitops.h>
-#include <linux/bits.h>
-#include <linux/cache.h>
-#include <linux/compiler.h>
-#include <linux/log2.h>
-
-#include "config.h"
-#include "errors.h"
-#include "geometry.h"
-#include "hash-utils.h"
-#include "indexer.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "numeric.h"
-#include "permassert.h"
-#include "thread-utils.h"
-
-/*
- * The volume index is a combination of two separate subindexes, one containing sparse hook entries
- * (retained for all chapters), and one containing the remaining entries (retained only for the
- * dense chapters). If there are no sparse chapters, only the non-hook subindex is used, and it
- * will contain all records for all chapters.
- *
- * The volume index is also divided into zones, with one thread operating on each zone. Each
- * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex.
- * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to
- * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta
- * lists must be the square of the maximum zone count for both subindexes.
- *
- * Each subindex zone is a delta index where the payload is a chapter number. The volume index can
- * compute the delta list number, address, and zone number from the record name in order to
- * dispatch record handling to the correct structures.
- *
- * Most operations that use all the zones take place either before request processing is allowed,
- * or after all requests have been flushed in order to shut down. The only multi-threaded operation
- * supported during normal operation is the uds_lookup_volume_index_name() method, used to determine
- * whether a new chapter should be loaded into the sparse index cache. This operation only uses the
- * sparse hook subindex, and the zone mutexes are used to make this operation safe.
- *
- * There are three ways of expressing chapter numbers in the volume index: virtual, index, and
- * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long.
- * Internally the subindex stores only the minimal number of bits necessary by masking away the
- * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when
- * flushing entries from older chapters, it rolls the index chapter number around so that the
- * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries()
- * for an example of this technique.
- *
- * For efficiency, when older chapter numbers become invalid, the index does not immediately remove
- * the invalidated entries. Instead it lazily removes them from a given delta list the next time it
- * walks that list during normal operation. Because of this, the index size must be increased
- * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard
- * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in
- * the index.
- */
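
The virtual/index/rolling relationship described above reduces to masking and re-basing against the zone's low bound. A small stand-alone sketch with assumed values (10 chapter bits, low bound 5000), not taken from the code:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Example: 10 chapter bits, so index chapters live in [0, 1023]. */
		uint32_t chapter_mask = (1u << 10) - 1;
		uint64_t virtual_chapter_low = 5000;	/* oldest chapter still indexed */

		/* Storing a virtual chapter keeps only its low-order bits. */
		uint64_t virtual_chapter = 5703;
		uint32_t index_chapter = (uint32_t) (virtual_chapter & chapter_mask);

		/* Recovering it rolls the index chapter around the low bound. */
		uint32_t rolling = (index_chapter - (uint32_t) virtual_chapter_low) & chapter_mask;
		uint64_t recovered = virtual_chapter_low + rolling;

		printf("stored %u, recovered %llu\n", (unsigned) index_chapter,
		       (unsigned long long) recovered);	/* recovered == 5703 */
		return 0;
	}
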
-
-struct sub_index_parameters {
-       /* The number of bits in address mask */
-       u8 address_bits;
-       /* The number of bits in chapter number */
-       u8 chapter_bits;
-       /* The mean delta */
-       u32 mean_delta;
-       /* The number of delta lists */
-       u64 list_count;
-       /* The number of chapters used */
-       u32 chapter_count;
-       /* The number of bits per chapter */
-       size_t chapter_size_in_bits;
-       /* The number of bytes of delta list memory */
-       size_t memory_size;
-       /* The number of bytes the index should keep free at all times */
-       size_t target_free_bytes;
-};
-
-struct split_config {
-       /* The hook subindex configuration */
-       struct uds_configuration hook_config;
-       struct index_geometry hook_geometry;
-
-       /* The non-hook subindex configuration */
-       struct uds_configuration non_hook_config;
-       struct index_geometry non_hook_geometry;
-};
-
-struct chapter_range {
-       u32 chapter_start;
-       u32 chapter_count;
-};
-
-enum { MAGIC_SIZE = 8 };
-static const char MAGIC_START_5[] = "MI5-0005";
-
-struct sub_index_data {
-       char magic[MAGIC_SIZE]; /* MAGIC_START_5 */
-       u64 volume_nonce;
-       u64 virtual_chapter_low;
-       u64 virtual_chapter_high;
-       u32 first_list;
-       u32 list_count;
-};
-
-static const char MAGIC_START_6[] = "MI6-0001";
-
-struct volume_index_data {
-       char magic[MAGIC_SIZE]; /* MAGIC_START_6 */
-       u32 sparse_sample_rate;
-};
-
-static inline u32 extract_address(const struct volume_sub_index *sub_index,
-                                 const struct uds_record_name *name)
-{
-       return uds_extract_volume_index_bytes(name) & sub_index->address_mask;
-}
-
-static inline u32 extract_dlist_num(const struct volume_sub_index *sub_index,
-                                   const struct uds_record_name *name)
-{
-       u64 bits = uds_extract_volume_index_bytes(name);
-
-       return (bits >> sub_index->address_bits) % sub_index->list_count;
-}
-
-static inline const struct volume_sub_index_zone *
-get_zone_for_record(const struct volume_index_record *record)
-{
-       return &record->sub_index->zones[record->zone_number];
-}
-
-static inline u64 convert_index_to_virtual(const struct volume_index_record *record,
-                                          u32 index_chapter)
-{
-       const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record);
-       u32 rolling_chapter = ((index_chapter - volume_index_zone->virtual_chapter_low) &
-                              record->sub_index->chapter_mask);
-
-       return volume_index_zone->virtual_chapter_low + rolling_chapter;
-}
-
-static inline u32 convert_virtual_to_index(const struct volume_sub_index *sub_index,
-                                          u64 virtual_chapter)
-{
-       return virtual_chapter & sub_index->chapter_mask;
-}
-
-static inline bool is_virtual_chapter_indexed(const struct volume_index_record *record,
-                                             u64 virtual_chapter)
-{
-       const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record);
-
-       return ((virtual_chapter >= volume_index_zone->virtual_chapter_low) &&
-               (virtual_chapter <= volume_index_zone->virtual_chapter_high));
-}
-
-static inline bool has_sparse(const struct volume_index *volume_index)
-{
-       return volume_index->sparse_sample_rate > 0;
-}
-
-bool uds_is_volume_index_sample(const struct volume_index *volume_index,
-                               const struct uds_record_name *name)
-{
-       if (!has_sparse(volume_index))
-               return false;
-
-       return (uds_extract_sampling_bytes(name) % volume_index->sparse_sample_rate) == 0;
-}
-
-static inline const struct volume_sub_index *
-get_volume_sub_index(const struct volume_index *volume_index,
-                    const struct uds_record_name *name)
-{
-       return (uds_is_volume_index_sample(volume_index, name) ?
-               &volume_index->vi_hook :
-               &volume_index->vi_non_hook);
-}
-
-static unsigned int get_volume_sub_index_zone(const struct volume_sub_index *sub_index,
-                                             const struct uds_record_name *name)
-{
-       return extract_dlist_num(sub_index, name) / sub_index->delta_index.lists_per_zone;
-}
-
-unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index,
-                                      const struct uds_record_name *name)
-{
-       return get_volume_sub_index_zone(get_volume_sub_index(volume_index, name), name);
-}
-
-static int compute_volume_sub_index_parameters(const struct uds_configuration *config,
-                                              struct sub_index_parameters *params)
-{
-       enum { DELTA_LIST_SIZE = 256 };
-       u64 entries_in_volume_index, address_span;
-       u32 chapters_in_volume_index, invalid_chapters;
-       u32 rounded_chapters;
-       u64 delta_list_records;
-       u32 address_count;
-       u64 index_size_in_bits;
-       size_t expected_index_size;
-       u64 min_delta_lists = MAX_ZONES * MAX_ZONES;
-       struct index_geometry *geometry = config->geometry;
-       u64 records_per_chapter = geometry->records_per_chapter;
-
-       params->chapter_count = geometry->chapters_per_volume;
-       /*
-        * Make sure that the number of delta list records in the volume index does not change when
-        * the volume is reduced by one chapter. This preserves the mapping from name to volume
-        * index delta list.
-        */
-       rounded_chapters = params->chapter_count;
-       if (uds_is_reduced_index_geometry(geometry))
-               rounded_chapters += 1;
-       delta_list_records = records_per_chapter * rounded_chapters;
-       address_count = config->volume_index_mean_delta * DELTA_LIST_SIZE;
-       params->list_count = max(delta_list_records / DELTA_LIST_SIZE, min_delta_lists);
-       params->address_bits = bits_per(address_count - 1);
-       params->chapter_bits = bits_per(rounded_chapters - 1);
-       if ((u32) params->list_count != params->list_count) {
-               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
-                                               "cannot initialize volume index with %llu delta lists",
-                                               (unsigned long long) params->list_count);
-       }
-
-       if (params->address_bits > 31) {
-               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
-                                               "cannot initialize volume index with %u address bits",
-                                               params->address_bits);
-       }
-
-       /*
-        * The probability that a given delta list is not touched during the writing of an entire
-        * chapter is:
-        *
-        * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count,
-        *                            records_per_chapter);
-        *
-        * For the standard index sizes, about 78% of the delta lists are not touched, and
-        * therefore contain old index entries that have not been eliminated by the lazy LRU
-        * processing. Then the number of old index entries that accumulate over the entire index,
-        * in terms of full chapters worth of entries, is:
-        *
-        * double invalid_chapters = p_not_touched / (1.0 - p_not_touched);
-        *
-        * For the standard index sizes, the index needs about 3.5 chapters of space for the old
-        * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in
-        * the index.
-        */
-       invalid_chapters = max(rounded_chapters / 256, 2U);
-       chapters_in_volume_index = rounded_chapters + invalid_chapters;
-       entries_in_volume_index = records_per_chapter * chapters_in_volume_index;
-
-       address_span = params->list_count << params->address_bits;
-       params->mean_delta = address_span / entries_in_volume_index;
-
-       /*
-        * Compute the expected size of a full index, then set the total memory to be 6% larger
-        * than that expected size. This number should be large enough that there are not many
-        * rebalances when the index is full.
-        */
-       params->chapter_size_in_bits = uds_compute_delta_index_size(records_per_chapter,
-                                                                   params->mean_delta,
-                                                                   params->chapter_bits);
-       index_size_in_bits = params->chapter_size_in_bits * chapters_in_volume_index;
-       expected_index_size = index_size_in_bits / BITS_PER_BYTE;
-       params->memory_size = expected_index_size * 106 / 100;
-
-       params->target_free_bytes = expected_index_size / 20;
-       return UDS_SUCCESS;
-}
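
The sizing argument in the comment block above can be checked numerically. The inputs below are illustrative assumptions chosen to be in the ballpark of a standard dense configuration (65536 records per chapter, 262144 delta lists), not values read from the code; they reproduce the "about 78% untouched" and "roughly 3.5 chapters of stale entries" figures:

	/* build: cc sizing_demo.c -lm */
	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		double records_per_chapter = 65536.0;	/* assumed example value */
		double list_count = 262144.0;		/* assumed example value */

		/* Probability that a given delta list sees no writes in one chapter. */
		double p_not_touched = pow((list_count - 1.0) / list_count,
					   records_per_chapter);

		/* Steady-state backlog of stale entries, measured in chapters. */
		double invalid_chapters = p_not_touched / (1.0 - p_not_touched);

		printf("p_not_touched    = %.3f\n", p_not_touched);	/* ~0.779 */
		printf("invalid_chapters = %.2f\n", invalid_chapters);	/* ~3.52 */
		return 0;
	}
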
-
-static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index)
-{
-       uds_free(uds_forget(sub_index->flush_chapters));
-       uds_free(uds_forget(sub_index->zones));
-       uds_uninitialize_delta_index(&sub_index->delta_index);
-}
-
-void uds_free_volume_index(struct volume_index *volume_index)
-{
-       if (volume_index == NULL)
-               return;
-
-       if (volume_index->zones != NULL)
-               uds_free(uds_forget(volume_index->zones));
-
-       uninitialize_volume_sub_index(&volume_index->vi_non_hook);
-       uninitialize_volume_sub_index(&volume_index->vi_hook);
-       uds_free(volume_index);
-}
-
-static int compute_volume_sub_index_save_bytes(const struct uds_configuration *config,
-                                              size_t *bytes)
-{
-       struct sub_index_parameters params = { .address_bits = 0 };
-       int result;
-
-       result = compute_volume_sub_index_parameters(config, &params);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       *bytes = (sizeof(struct sub_index_data) + params.list_count * sizeof(u64) +
-                 uds_compute_delta_index_save_bytes(params.list_count,
-                                                    params.memory_size));
-       return UDS_SUCCESS;
-}
-
-/* This function is only useful if the configuration includes sparse chapters. */
-static void split_configuration(const struct uds_configuration *config,
-                               struct split_config *split)
-{
-       u64 sample_rate, sample_records;
-       u64 dense_chapters, sparse_chapters;
-
-       /* Start with copies of the base configuration. */
-       split->hook_config = *config;
-       split->hook_geometry = *config->geometry;
-       split->hook_config.geometry = &split->hook_geometry;
-       split->non_hook_config = *config;
-       split->non_hook_geometry = *config->geometry;
-       split->non_hook_config.geometry = &split->non_hook_geometry;
-
-       sample_rate = config->sparse_sample_rate;
-       sparse_chapters = config->geometry->sparse_chapters_per_volume;
-       dense_chapters = config->geometry->chapters_per_volume - sparse_chapters;
-       sample_records = config->geometry->records_per_chapter / sample_rate;
-
-       /* Adjust the number of records indexed for each chapter. */
-       split->hook_geometry.records_per_chapter = sample_records;
-       split->non_hook_geometry.records_per_chapter -= sample_records;
-
-       /* Adjust the number of chapters indexed. */
-       split->hook_geometry.sparse_chapters_per_volume = 0;
-       split->non_hook_geometry.sparse_chapters_per_volume = 0;
-       split->non_hook_geometry.chapters_per_volume = dense_chapters;
-}
-
-static int compute_volume_index_save_bytes(const struct uds_configuration *config,
-                                          size_t *bytes)
-{
-       size_t hook_bytes, non_hook_bytes;
-       struct split_config split;
-       int result;
-
-       if (!uds_is_sparse_index_geometry(config->geometry))
-               return compute_volume_sub_index_save_bytes(config, bytes);
-
-       split_configuration(config, &split);
-       result = compute_volume_sub_index_save_bytes(&split.hook_config, &hook_bytes);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = compute_volume_sub_index_save_bytes(&split.non_hook_config,
-                                                    &non_hook_bytes);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       *bytes = sizeof(struct volume_index_data) + hook_bytes + non_hook_bytes;
-       return UDS_SUCCESS;
-}
-
-int uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
-                                        size_t block_size, u64 *block_count)
-{
-       size_t bytes;
-       int result;
-
-       result = compute_volume_index_save_bytes(config, &bytes);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       bytes += sizeof(struct delta_list_save_info);
-       *block_count = DIV_ROUND_UP(bytes, block_size) + MAX_ZONES;
-       return UDS_SUCCESS;
-}
-
-/* Flush invalid entries while walking the delta list. */
-static inline int flush_invalid_entries(struct volume_index_record *record,
-                                       struct chapter_range *flush_range,
-                                       u32 *next_chapter_to_invalidate)
-{
-       int result;
-
-       result = uds_next_delta_index_entry(&record->delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       while (!record->delta_entry.at_end) {
-               u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);
-               u32 relative_chapter = ((index_chapter - flush_range->chapter_start) &
-                                       record->sub_index->chapter_mask);
-
-               if (likely(relative_chapter >= flush_range->chapter_count)) {
-                       if (relative_chapter < *next_chapter_to_invalidate)
-                               *next_chapter_to_invalidate = relative_chapter;
-                       break;
-               }
-
-               result = uds_remove_delta_index_entry(&record->delta_entry);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       return UDS_SUCCESS;
-}
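
Lazy invalidation as performed by flush_invalid_entries() amounts to dropping, while walking a list, every entry whose chapter falls inside the expired window of the chapter ring. A simplified stand-alone model (a flat array instead of a delta list; names are hypothetical):

	#include <stdio.h>
	#include <stdint.h>

	/*
	 * Remove entries whose chapter lies inside the expired window
	 * [start, start + count) on a ring of (mask + 1) chapters, keeping the
	 * rest in order. Returns the new number of entries.
	 */
	static unsigned int flush_expired(uint32_t *chapters, unsigned int n,
					  uint32_t start, uint32_t count, uint32_t mask)
	{
		unsigned int kept = 0, i;

		for (i = 0; i < n; i++) {
			uint32_t relative = (chapters[i] - start) & mask;

			if (relative >= count)	/* still valid: keep it */
				chapters[kept++] = chapters[i];
			/* otherwise the entry is stale and simply dropped */
		}

		return kept;
	}

	int main(void)
	{
		/* Ring of 1024 chapters; chapters 100..103 have expired. */
		uint32_t chapters[] = { 99, 100, 101, 500, 102, 900 };
		unsigned int n = flush_expired(chapters, 6, 100, 4, 1023);
		unsigned int i;

		for (i = 0; i < n; i++)
			printf("%u ", (unsigned) chapters[i]);	/* prints: 99 500 900 */
		printf("\n");
		return 0;
	}
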
-
-/* Find the matching record, or the list offset where the record would go. */
-static int get_volume_index_entry(struct volume_index_record *record, u32 list_number,
-                                 u32 key, struct chapter_range *flush_range)
-{
-       struct volume_index_record other_record;
-       const struct volume_sub_index *sub_index = record->sub_index;
-       u32 next_chapter_to_invalidate = sub_index->chapter_mask;
-       int result;
-
-       result = uds_start_delta_index_search(&sub_index->delta_index, list_number, 0,
-                                             &record->delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       do {
-               result = flush_invalid_entries(record, flush_range,
-                                              &next_chapter_to_invalidate);
-               if (result != UDS_SUCCESS)
-                       return result;
-       } while (!record->delta_entry.at_end && (key > record->delta_entry.key));
-
-       result = uds_remember_delta_index_offset(&record->delta_entry);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       /* Check any collision records for a more precise match. */
-       other_record = *record;
-       if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) {
-               for (;;) {
-                       u8 collision_name[UDS_RECORD_NAME_SIZE];
-
-                       result = flush_invalid_entries(&other_record, flush_range,
-                                                      &next_chapter_to_invalidate);
-                       if (result != UDS_SUCCESS)
-                               return result;
-
-                       if (other_record.delta_entry.at_end ||
-                           !other_record.delta_entry.is_collision)
-                               break;
-
-                       result = uds_get_delta_entry_collision(&other_record.delta_entry,
-                                                              collision_name);
-                       if (result != UDS_SUCCESS)
-                               return result;
-
-                       if (memcmp(collision_name, record->name, UDS_RECORD_NAME_SIZE) == 0) {
-                               *record = other_record;
-                               break;
-                       }
-               }
-       }
-       while (!other_record.delta_entry.at_end) {
-               result = flush_invalid_entries(&other_record, flush_range,
-                                              &next_chapter_to_invalidate);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-       next_chapter_to_invalidate += flush_range->chapter_start;
-       next_chapter_to_invalidate &= sub_index->chapter_mask;
-       flush_range->chapter_start = next_chapter_to_invalidate;
-       flush_range->chapter_count = 0;
-       return UDS_SUCCESS;
-}
-
-static int get_volume_sub_index_record(struct volume_sub_index *sub_index,
-                                      const struct uds_record_name *name,
-                                      struct volume_index_record *record)
-{
-       int result;
-       const struct volume_sub_index_zone *volume_index_zone;
-       u32 address = extract_address(sub_index, name);
-       u32 delta_list_number = extract_dlist_num(sub_index, name);
-       u64 flush_chapter = sub_index->flush_chapters[delta_list_number];
-
-       record->sub_index = sub_index;
-       record->mutex = NULL;
-       record->name = name;
-       record->zone_number = delta_list_number / sub_index->delta_index.lists_per_zone;
-       volume_index_zone = get_zone_for_record(record);
-
-       if (flush_chapter < volume_index_zone->virtual_chapter_low) {
-               struct chapter_range range;
-               u64 flush_count = volume_index_zone->virtual_chapter_low - flush_chapter;
-
-               range.chapter_start = convert_virtual_to_index(sub_index, flush_chapter);
-               range.chapter_count = (flush_count > sub_index->chapter_mask ?
-                                      sub_index->chapter_mask + 1 :
-                                      flush_count);
-               result = get_volume_index_entry(record, delta_list_number, address,
-                                               &range);
-               flush_chapter = convert_index_to_virtual(record, range.chapter_start);
-               if (flush_chapter > volume_index_zone->virtual_chapter_high)
-                       flush_chapter = volume_index_zone->virtual_chapter_high;
-               sub_index->flush_chapters[delta_list_number] = flush_chapter;
-       } else {
-               result = uds_get_delta_index_entry(&sub_index->delta_index,
-                                                  delta_list_number, address,
-                                                  name->name, &record->delta_entry);
-       }
-
-       if (result != UDS_SUCCESS)
-               return result;
-
-       record->is_found =
-               (!record->delta_entry.at_end && (record->delta_entry.key == address));
-       if (record->is_found) {
-               u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry);
-
-               record->virtual_chapter = convert_index_to_virtual(record, index_chapter);
-       }
-
-       record->is_collision = record->delta_entry.is_collision;
-       return UDS_SUCCESS;
-}
-
-int uds_get_volume_index_record(struct volume_index *volume_index,
-                               const struct uds_record_name *name,
-                               struct volume_index_record *record)
-{
-       int result;
-
-       if (uds_is_volume_index_sample(volume_index, name)) {
-               /*
-                * Other threads cannot be allowed to call uds_lookup_volume_index_name() while
-                * this thread is finding the volume index record. Due to the lazy LRU flushing of
-                * the volume index, uds_get_volume_index_record() is not a read-only operation.
-                */
-               unsigned int zone =
-                       get_volume_sub_index_zone(&volume_index->vi_hook, name);
-               struct mutex *mutex = &volume_index->zones[zone].hook_mutex;
-
-               mutex_lock(mutex);
-               result = get_volume_sub_index_record(&volume_index->vi_hook, name,
-                                                    record);
-               mutex_unlock(mutex);
-               /* Remember the mutex so that other operations on the index record can use it. */
-               record->mutex = mutex;
-       } else {
-               result = get_volume_sub_index_record(&volume_index->vi_non_hook, name,
-                                                    record);
-       }
-
-       return result;
-}
-
-int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_chapter)
-{
-       int result;
-       u32 address;
-       const struct volume_sub_index *sub_index = record->sub_index;
-
-       if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
-               u64 low = get_zone_for_record(record)->virtual_chapter_low;
-               u64 high = get_zone_for_record(record)->virtual_chapter_high;
-
-               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
-                                               "cannot put record into chapter number %llu that is out of the valid range %llu to %llu",
-                                               (unsigned long long) virtual_chapter,
-                                               (unsigned long long) low,
-                                               (unsigned long long) high);
-       }
-       address = extract_address(sub_index, record->name);
-       if (unlikely(record->mutex != NULL))
-               mutex_lock(record->mutex);
-       result = uds_put_delta_index_entry(&record->delta_entry, address,
-                                          convert_virtual_to_index(sub_index,
-                                                                   virtual_chapter),
-                                          record->is_found ? record->name->name : NULL);
-       if (unlikely(record->mutex != NULL))
-               mutex_unlock(record->mutex);
-       switch (result) {
-       case UDS_SUCCESS:
-               record->virtual_chapter = virtual_chapter;
-               record->is_collision = record->delta_entry.is_collision;
-               record->is_found = true;
-               break;
-       case UDS_OVERFLOW:
-               uds_log_ratelimit(uds_log_warning_strerror, UDS_OVERFLOW,
-                                 "Volume index entry dropped due to overflow condition");
-               uds_log_delta_index_entry(&record->delta_entry);
-               break;
-       default:
-               break;
-       }
-
-       return result;
-}
-
-int uds_remove_volume_index_record(struct volume_index_record *record)
-{
-       int result;
-
-       if (!record->is_found)
-               return uds_log_warning_strerror(UDS_BAD_STATE,
-                                               "illegal operation on new record");
-
-       /* Mark the record so that it cannot be used again */
-       record->is_found = false;
-       if (unlikely(record->mutex != NULL))
-               mutex_lock(record->mutex);
-       result = uds_remove_delta_index_entry(&record->delta_entry);
-       if (unlikely(record->mutex != NULL))
-               mutex_unlock(record->mutex);
-       return result;
-}
-
-static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_index,
-                                                  unsigned int zone_number,
-                                                  u64 virtual_chapter)
-{
-       u64 used_bits = 0;
-       struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
-       struct delta_zone *delta_zone;
-       u32 i;
-
-       zone->virtual_chapter_low = (virtual_chapter >= sub_index->chapter_count ?
-                                    virtual_chapter - sub_index->chapter_count + 1 :
-                                    0);
-       zone->virtual_chapter_high = virtual_chapter;
-
-       /* Check to see if the new zone data is too large. */
-       delta_zone = &sub_index->delta_index.delta_zones[zone_number];
-       for (i = 1; i <= delta_zone->list_count; i++)
-               used_bits += delta_zone->delta_lists[i].size;
-
-       if (used_bits > sub_index->max_zone_bits) {
-               /* Expire enough chapters to free the desired space. */
-               u64 expire_count =
-                       1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits;
-
-               if (expire_count == 1) {
-                       uds_log_ratelimit(uds_log_info,
-                                         "zone %u:  At chapter %llu, expiring chapter %llu early",
-                                         zone_number,
-                                         (unsigned long long) virtual_chapter,
-                                         (unsigned long long) zone->virtual_chapter_low);
-                       zone->early_flushes++;
-                       zone->virtual_chapter_low++;
-               } else {
-                       u64 first_expired = zone->virtual_chapter_low;
-
-                       if (first_expired + expire_count < zone->virtual_chapter_high) {
-                               zone->early_flushes += expire_count;
-                               zone->virtual_chapter_low += expire_count;
-                       } else {
-                               zone->early_flushes +=
-                                       zone->virtual_chapter_high - zone->virtual_chapter_low;
-                               zone->virtual_chapter_low = zone->virtual_chapter_high;
-                       }
-                       uds_log_ratelimit(uds_log_info,
-                                         "zone %u:  At chapter %llu, expiring chapters %llu to %llu early",
-                                         zone_number,
-                                         (unsigned long long) virtual_chapter,
-                                         (unsigned long long) first_expired,
-                                         (unsigned long long) zone->virtual_chapter_low - 1);
-               }
-       }
-}
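
The early-expiration arithmetic above is just computing how many chapters' worth of delta-list bits must be freed to get back under the per-zone limit. A small sketch with made-up sizes (the real limits come from the computed sub-index parameters):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		/* Hypothetical sizes, in bits of delta-list memory per zone. */
		uint64_t max_zone_bits = 1000000;
		uint64_t chapter_zone_bits = 4000;	/* average bits added per chapter */
		uint64_t used_bits = 1009500;		/* what the zone currently holds */

		if (used_bits > max_zone_bits) {
			/* Expire enough of the oldest chapters to get back under the limit. */
			uint64_t expire_count =
				1 + (used_bits - max_zone_bits) / chapter_zone_bits;

			printf("expire %llu chapter(s) early\n",
			       (unsigned long long) expire_count);	/* expire 3 chapter(s) */
		}
		return 0;
	}
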
-
-void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
-                                           unsigned int zone_number,
-                                           u64 virtual_chapter)
-{
-       struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex;
-
-       set_volume_sub_index_zone_open_chapter(&volume_index->vi_non_hook, zone_number,
-                                              virtual_chapter);
-
-       /*
-        * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open
-        * chapter number is changing.
-        */
-       if (has_sparse(volume_index)) {
-               mutex_lock(mutex);
-               set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook,
-                                                      zone_number, virtual_chapter);
-               mutex_unlock(mutex);
-       }
-}
-
-/*
- * Set the newest open chapter number for the index, while also advancing the oldest valid chapter
- * number.
- */
-void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
-                                      u64 virtual_chapter)
-{
-       unsigned int zone;
-
-       for (zone = 0; zone < volume_index->zone_count; zone++)
-               uds_set_volume_index_zone_open_chapter(volume_index, zone, virtual_chapter);
-}
-
-int uds_set_volume_index_record_chapter(struct volume_index_record *record,
-                                       u64 virtual_chapter)
-{
-       const struct volume_sub_index *sub_index = record->sub_index;
-       int result;
-
-       if (!record->is_found)
-               return uds_log_warning_strerror(UDS_BAD_STATE,
-                                               "illegal operation on new record");
-
-       if (!is_virtual_chapter_indexed(record, virtual_chapter)) {
-               u64 low = get_zone_for_record(record)->virtual_chapter_low;
-               u64 high = get_zone_for_record(record)->virtual_chapter_high;
-
-               return uds_log_warning_strerror(UDS_INVALID_ARGUMENT,
-                                               "cannot set chapter number %llu that is out of the valid range %llu to %llu",
-                                               (unsigned long long) virtual_chapter,
-                                               (unsigned long long) low,
-                                               (unsigned long long) high);
-       }
-
-       if (unlikely(record->mutex != NULL))
-               mutex_lock(record->mutex);
-       result = uds_set_delta_entry_value(&record->delta_entry,
-                                          convert_virtual_to_index(sub_index,
-                                                                   virtual_chapter));
-       if (unlikely(record->mutex != NULL))
-               mutex_unlock(record->mutex);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       record->virtual_chapter = virtual_chapter;
-       return UDS_SUCCESS;
-}
-
-static u64 lookup_volume_sub_index_name(const struct volume_sub_index *sub_index,
-                                       const struct uds_record_name *name)
-{
-       int result;
-       u32 address = extract_address(sub_index, name);
-       u32 delta_list_number = extract_dlist_num(sub_index, name);
-       unsigned int zone_number = get_volume_sub_index_zone(sub_index, name);
-       const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number];
-       u64 virtual_chapter;
-       u32 index_chapter;
-       u32 rolling_chapter;
-       struct delta_index_entry delta_entry;
-
-       result = uds_get_delta_index_entry(&sub_index->delta_index, delta_list_number,
-                                          address, name->name, &delta_entry);
-       if (result != UDS_SUCCESS)
-               return NO_CHAPTER;
-
-       if (delta_entry.at_end || (delta_entry.key != address))
-               return NO_CHAPTER;
-
-       index_chapter = uds_get_delta_entry_value(&delta_entry);
-       rolling_chapter = (index_chapter - zone->virtual_chapter_low) & sub_index->chapter_mask;
-
-       virtual_chapter = zone->virtual_chapter_low + rolling_chapter;
-       if (virtual_chapter > zone->virtual_chapter_high)
-               return NO_CHAPTER;
-
-       return virtual_chapter;
-}
-
-/* Do a read-only lookup of the record name for sparse cache management. */
-u64 uds_lookup_volume_index_name(const struct volume_index *volume_index,
-                                const struct uds_record_name *name)
-{
-       unsigned int zone_number = uds_get_volume_index_zone(volume_index, name);
-       struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex;
-       u64 virtual_chapter;
-
-       if (!uds_is_volume_index_sample(volume_index, name))
-               return NO_CHAPTER;
-
-       mutex_lock(mutex);
-       virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name);
-       mutex_unlock(mutex);
-
-       return virtual_chapter;
-}
-
-static void abort_restoring_volume_sub_index(struct volume_sub_index *sub_index)
-{
-       uds_reset_delta_index(&sub_index->delta_index);
-}
-
-static void abort_restoring_volume_index(struct volume_index *volume_index)
-{
-       abort_restoring_volume_sub_index(&volume_index->vi_non_hook);
-       if (has_sparse(volume_index))
-               abort_restoring_volume_sub_index(&volume_index->vi_hook);
-}
-
-static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
-                                           struct buffered_reader **readers,
-                                           unsigned int reader_count)
-{
-       unsigned int z;
-       int result;
-       u64 virtual_chapter_low = 0, virtual_chapter_high = 0;
-       unsigned int i;
-
-       for (i = 0; i < reader_count; i++) {
-               struct sub_index_data header;
-               u8 buffer[sizeof(struct sub_index_data)];
-               size_t offset = 0;
-               u32 j;
-
-               result = uds_read_from_buffered_reader(readers[i], buffer,
-                                                      sizeof(buffer));
-               if (result != UDS_SUCCESS) {
-                       return uds_log_warning_strerror(result,
-                                                       "failed to read volume index header");
-               }
-
-               memcpy(&header.magic, buffer, MAGIC_SIZE);
-               offset += MAGIC_SIZE;
-               decode_u64_le(buffer, &offset, &header.volume_nonce);
-               decode_u64_le(buffer, &offset, &header.virtual_chapter_low);
-               decode_u64_le(buffer, &offset, &header.virtual_chapter_high);
-               decode_u32_le(buffer, &offset, &header.first_list);
-               decode_u32_le(buffer, &offset, &header.list_count);
-
-               result = ASSERT(offset == sizeof(buffer),
-                               "%zu bytes decoded of %zu expected", offset,
-                               sizeof(buffer));
-               if (result != UDS_SUCCESS)
-                       result = UDS_CORRUPT_DATA;
-
-               if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "volume index file had bad magic number");
-               }
-
-               if (sub_index->volume_nonce == 0) {
-                       sub_index->volume_nonce = header.volume_nonce;
-               } else if (header.volume_nonce != sub_index->volume_nonce) {
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "volume index volume nonce incorrect");
-               }
-
-               if (i == 0) {
-                       virtual_chapter_low = header.virtual_chapter_low;
-                       virtual_chapter_high = header.virtual_chapter_high;
-               } else if (virtual_chapter_high != header.virtual_chapter_high) {
-                       u64 low = header.virtual_chapter_low;
-                       u64 high = header.virtual_chapter_high;
-
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]",
-                                                       (unsigned long long) virtual_chapter_low,
-                                                       (unsigned long long) virtual_chapter_high,
-                                                       i, (unsigned long long) low,
-                                                       (unsigned long long) high);
-               } else if (virtual_chapter_low < header.virtual_chapter_low) {
-                       virtual_chapter_low = header.virtual_chapter_low;
-               }
-
-               for (j = 0; j < header.list_count; j++) {
-                       u8 decoded[sizeof(u64)];
-
-                       result = uds_read_from_buffered_reader(readers[i], decoded,
-                                                              sizeof(u64));
-                       if (result != UDS_SUCCESS) {
-                               return uds_log_warning_strerror(result,
-                                                               "failed to read volume index flush ranges");
-                       }
-
-                       sub_index->flush_chapters[header.first_list + j] =
-                               get_unaligned_le64(decoded);
-               }
-       }
-
-       for (z = 0; z < sub_index->zone_count; z++) {
-               memset(&sub_index->zones[z], 0, sizeof(struct volume_sub_index_zone));
-               sub_index->zones[z].virtual_chapter_low = virtual_chapter_low;
-               sub_index->zones[z].virtual_chapter_high = virtual_chapter_high;
-       }
-
-       result = uds_start_restoring_delta_index(&sub_index->delta_index, readers,
-                                                reader_count);
-       if (result != UDS_SUCCESS)
-               return uds_log_warning_strerror(result, "restoring delta index failed");
-
-       return UDS_SUCCESS;
-}
-
-static int start_restoring_volume_index(struct volume_index *volume_index,
-                                       struct buffered_reader **buffered_readers,
-                                       unsigned int reader_count)
-{
-       unsigned int i;
-       int result;
-
-       if (!has_sparse(volume_index)) {
-               return start_restoring_volume_sub_index(&volume_index->vi_non_hook,
-                                                       buffered_readers, reader_count);
-       }
-
-       for (i = 0; i < reader_count; i++) {
-               struct volume_index_data header;
-               u8 buffer[sizeof(struct volume_index_data)];
-               size_t offset = 0;
-
-               result = uds_read_from_buffered_reader(buffered_readers[i], buffer,
-                                                      sizeof(buffer));
-               if (result != UDS_SUCCESS) {
-                       return uds_log_warning_strerror(result,
-                                                       "failed to read volume index header");
-               }
-
-               memcpy(&header.magic, buffer, MAGIC_SIZE);
-               offset += MAGIC_SIZE;
-               decode_u32_le(buffer, &offset, &header.sparse_sample_rate);
-
-               result = ASSERT(offset == sizeof(buffer),
-                               "%zu bytes decoded of %zu expected", offset,
-                               sizeof(buffer));
-               if (result != UDS_SUCCESS)
-                       result = UDS_CORRUPT_DATA;
-
-               if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
-                       return uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                       "volume index file had bad magic number");
-
-               if (i == 0) {
-                       volume_index->sparse_sample_rate = header.sparse_sample_rate;
-               } else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) {
-                       uds_log_warning_strerror(UDS_CORRUPT_DATA,
-                                                "Inconsistent sparse sample rate in delta index zone files: %u vs. %u",
-                                                volume_index->sparse_sample_rate,
-                                                header.sparse_sample_rate);
-                       return UDS_CORRUPT_DATA;
-               }
-       }
-
-       result = start_restoring_volume_sub_index(&volume_index->vi_non_hook,
-                                                 buffered_readers, reader_count);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return start_restoring_volume_sub_index(&volume_index->vi_hook, buffered_readers,
-                                               reader_count);
-}
-
-static int finish_restoring_volume_sub_index(struct volume_sub_index *sub_index,
-                                            struct buffered_reader **buffered_readers,
-                                            unsigned int reader_count)
-{
-       return uds_finish_restoring_delta_index(&sub_index->delta_index,
-                                               buffered_readers, reader_count);
-}
-
-static int finish_restoring_volume_index(struct volume_index *volume_index,
-                                        struct buffered_reader **buffered_readers,
-                                        unsigned int reader_count)
-{
-       int result;
-
-       result = finish_restoring_volume_sub_index(&volume_index->vi_non_hook,
-                                                  buffered_readers, reader_count);
-       if ((result == UDS_SUCCESS) && has_sparse(volume_index)) {
-               result = finish_restoring_volume_sub_index(&volume_index->vi_hook,
-                                                          buffered_readers,
-                                                          reader_count);
-       }
-
-       return result;
-}
-
-int uds_load_volume_index(struct volume_index *volume_index,
-                         struct buffered_reader **readers, unsigned int reader_count)
-{
-       int result;
-
-       /* Start by reading the header section of the stream. */
-       result = start_restoring_volume_index(volume_index, readers, reader_count);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = finish_restoring_volume_index(volume_index, readers, reader_count);
-       if (result != UDS_SUCCESS) {
-               abort_restoring_volume_index(volume_index);
-               return result;
-       }
-
-       /* Check the final guard lists to make sure there is no extra data. */
-       result = uds_check_guard_delta_lists(readers, reader_count);
-       if (result != UDS_SUCCESS)
-               abort_restoring_volume_index(volume_index);
-
-       return result;
-}
-
-static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index,
-                                        unsigned int zone_number,
-                                        struct buffered_writer *buffered_writer)
-{
-       int result;
-       struct volume_sub_index_zone *volume_index_zone = &sub_index->zones[zone_number];
-       u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list;
-       u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count;
-       u8 buffer[sizeof(struct sub_index_data)];
-       size_t offset = 0;
-       u32 i;
-
-       memcpy(buffer, MAGIC_START_5, MAGIC_SIZE);
-       offset += MAGIC_SIZE;
-       encode_u64_le(buffer, &offset, sub_index->volume_nonce);
-       encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_low);
-       encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_high);
-       encode_u32_le(buffer, &offset, first_list);
-       encode_u32_le(buffer, &offset, list_count);
-
-       result =  ASSERT(offset == sizeof(struct sub_index_data),
-                        "%zu bytes of config written, of %zu expected", offset,
-                        sizeof(struct sub_index_data));
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_write_to_buffered_writer(buffered_writer, buffer, offset);
-       if (result != UDS_SUCCESS)
-               return uds_log_warning_strerror(result,
-                                               "failed to write volume index header");
-
-       for (i = 0; i < list_count; i++) {
-               u8 encoded[sizeof(u64)];
-
-               put_unaligned_le64(sub_index->flush_chapters[first_list + i], &encoded);
-               result = uds_write_to_buffered_writer(buffered_writer, encoded,
-                                                     sizeof(u64));
-               if (result != UDS_SUCCESS) {
-                       return uds_log_warning_strerror(result,
-                                                       "failed to write volume index flush ranges");
-               }
-       }
-
-       return uds_start_saving_delta_index(&sub_index->delta_index, zone_number,
-                                           buffered_writer);
-}
-
-static int start_saving_volume_index(const struct volume_index *volume_index,
-                                    unsigned int zone_number,
-                                    struct buffered_writer *writer)
-{
-       u8 buffer[sizeof(struct volume_index_data)];
-       size_t offset = 0;
-       int result;
-
-       if (!has_sparse(volume_index)) {
-               return start_saving_volume_sub_index(&volume_index->vi_non_hook,
-                                                    zone_number, writer);
-       }
-
-       memcpy(buffer, MAGIC_START_6, MAGIC_SIZE);
-       offset += MAGIC_SIZE;
-       encode_u32_le(buffer, &offset, volume_index->sparse_sample_rate);
-       result = ASSERT(offset == sizeof(struct volume_index_data),
-                       "%zu bytes of header written, of %zu expected", offset,
-                       sizeof(struct volume_index_data));
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_write_to_buffered_writer(writer, buffer, offset);
-       if (result != UDS_SUCCESS) {
-               uds_log_warning_strerror(result, "failed to write volume index header");
-               return result;
-       }
-
-       result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number,
-                                              writer);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return start_saving_volume_sub_index(&volume_index->vi_hook, zone_number,
-                                            writer);
-}
-
-static int finish_saving_volume_sub_index(const struct volume_sub_index *sub_index,
-                                         unsigned int zone_number)
-{
-       return uds_finish_saving_delta_index(&sub_index->delta_index, zone_number);
-}
-
-static int finish_saving_volume_index(const struct volume_index *volume_index,
-                                     unsigned int zone_number)
-{
-       int result;
-
-       result = finish_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number);
-       if ((result == UDS_SUCCESS) && has_sparse(volume_index))
-               result = finish_saving_volume_sub_index(&volume_index->vi_hook, zone_number);
-       return result;
-}
-
-int uds_save_volume_index(struct volume_index *volume_index,
-                         struct buffered_writer **writers, unsigned int writer_count)
-{
-       int result = UDS_SUCCESS;
-       unsigned int zone;
-
-       for (zone = 0; zone < writer_count; zone++) {
-               result = start_saving_volume_index(volume_index, zone, writers[zone]);
-               if (result != UDS_SUCCESS)
-                       break;
-
-               result = finish_saving_volume_index(volume_index, zone);
-               if (result != UDS_SUCCESS)
-                       break;
-
-               result = uds_write_guard_delta_list(writers[zone]);
-               if (result != UDS_SUCCESS)
-                       break;
-
-               result = uds_flush_buffered_writer(writers[zone]);
-               if (result != UDS_SUCCESS)
-                       break;
-       }
-
-       return result;
-}
-
-static void get_volume_sub_index_stats(const struct volume_sub_index *sub_index,
-                                      struct volume_index_stats *stats)
-{
-       struct delta_index_stats dis;
-       unsigned int z;
-
-       uds_get_delta_index_stats(&sub_index->delta_index, &dis);
-       stats->rebalance_time = dis.rebalance_time;
-       stats->rebalance_count = dis.rebalance_count;
-       stats->record_count = dis.record_count;
-       stats->collision_count = dis.collision_count;
-       stats->discard_count = dis.discard_count;
-       stats->overflow_count = dis.overflow_count;
-       stats->delta_lists = dis.list_count;
-       stats->early_flushes = 0;
-       for (z = 0; z < sub_index->zone_count; z++)
-               stats->early_flushes += sub_index->zones[z].early_flushes;
-}
-
-void uds_get_volume_index_stats(const struct volume_index *volume_index,
-                               struct volume_index_stats *stats)
-{
-       struct volume_index_stats sparse_stats;
-
-       get_volume_sub_index_stats(&volume_index->vi_non_hook, stats);
-       if (!has_sparse(volume_index))
-               return;
-
-       get_volume_sub_index_stats(&volume_index->vi_hook, &sparse_stats);
-       stats->rebalance_time += sparse_stats.rebalance_time;
-       stats->rebalance_count += sparse_stats.rebalance_count;
-       stats->record_count += sparse_stats.record_count;
-       stats->collision_count += sparse_stats.collision_count;
-       stats->discard_count += sparse_stats.discard_count;
-       stats->overflow_count += sparse_stats.overflow_count;
-       stats->delta_lists += sparse_stats.delta_lists;
-       stats->early_flushes += sparse_stats.early_flushes;
-}
-
-static int initialize_volume_sub_index(const struct uds_configuration *config,
-                                      u64 volume_nonce, u8 tag,
-                                      struct volume_sub_index *sub_index)
-{
-       struct sub_index_parameters params = { .address_bits = 0 };
-       unsigned int zone_count = config->zone_count;
-       u64 available_bytes = 0;
-       unsigned int z;
-       int result;
-
-       result = compute_volume_sub_index_parameters(config, &params);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       sub_index->address_bits = params.address_bits;
-       sub_index->address_mask = (1u << params.address_bits) - 1;
-       sub_index->chapter_bits = params.chapter_bits;
-       sub_index->chapter_mask = (1u << params.chapter_bits) - 1;
-       sub_index->chapter_count = params.chapter_count;
-       sub_index->list_count = params.list_count;
-       sub_index->zone_count = zone_count;
-       sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count;
-       sub_index->volume_nonce = volume_nonce;
-
-       result = uds_initialize_delta_index(&sub_index->delta_index, zone_count,
-                                           params.list_count, params.mean_delta,
-                                           params.chapter_bits, params.memory_size,
-                                           tag);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       for (z = 0; z < sub_index->delta_index.zone_count; z++)
-               available_bytes += sub_index->delta_index.delta_zones[z].size;
-       available_bytes -= params.target_free_bytes;
-       sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count;
-       sub_index->memory_size = (sub_index->delta_index.memory_size +
-                                 sizeof(struct volume_sub_index) +
-                                 (params.list_count * sizeof(u64)) +
-                                 (zone_count * sizeof(struct volume_sub_index_zone)));
-
-       /* The following arrays are initialized to all zeros. */
-       result = uds_allocate(params.list_count, u64, "first chapter to flush",
-                             &sub_index->flush_chapters);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       return uds_allocate(zone_count, struct volume_sub_index_zone,
-                           "volume index zones", &sub_index->zones);
-}
-
-int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce,
-                         struct volume_index **volume_index_ptr)
-{
-       struct split_config split;
-       unsigned int zone;
-       struct volume_index *volume_index;
-       int result;
-
-       result = uds_allocate(1, struct volume_index, "volume index", &volume_index);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       volume_index->zone_count = config->zone_count;
-
-       if (!uds_is_sparse_index_geometry(config->geometry)) {
-               result = initialize_volume_sub_index(config, volume_nonce, 'm',
-                                                    &volume_index->vi_non_hook);
-               if (result != UDS_SUCCESS) {
-                       uds_free_volume_index(volume_index);
-                       return result;
-               }
-
-               volume_index->memory_size = volume_index->vi_non_hook.memory_size;
-               *volume_index_ptr = volume_index;
-               return UDS_SUCCESS;
-       }
-
-       volume_index->sparse_sample_rate = config->sparse_sample_rate;
-
-       result = uds_allocate(config->zone_count, struct volume_index_zone,
-                             "volume index zones", &volume_index->zones);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume_index(volume_index);
-               return result;
-       }
-
-       for (zone = 0; zone < config->zone_count; zone++)
-               mutex_init(&volume_index->zones[zone].hook_mutex);
-
-       split_configuration(config, &split);
-       result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd',
-                                            &volume_index->vi_non_hook);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume_index(volume_index);
-               return uds_log_error_strerror(result,
-                                             "Error creating non hook volume index");
-       }
-
-       result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's',
-                                            &volume_index->vi_hook);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume_index(volume_index);
-               return uds_log_error_strerror(result,
-                                             "Error creating hook volume index");
-       }
-
-       volume_index->memory_size =
-               volume_index->vi_non_hook.memory_size + volume_index->vi_hook.memory_size;
-       *volume_index_ptr = volume_index;
-       return UDS_SUCCESS;
-}
diff --git a/drivers/md/dm-vdo/volume-index.h b/drivers/md/dm-vdo/volume-index.h
deleted file mode 100644 (file)
index 66bf14f..0000000
+++ /dev/null
@@ -1,192 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_VOLUME_INDEX_H
-#define UDS_VOLUME_INDEX_H
-
-#include <linux/limits.h>
-
-#include "config.h"
-#include "delta-index.h"
-#include "indexer.h"
-#include "thread-utils.h"
-
-/*
- * The volume index is the primary top-level index for UDS. It contains records which map a record
- * name to the chapter where a record with that name is stored. This mapping can definitively say
- * when no record exists. However, because we only use a subset of the name for this index, it
- * cannot definitively say that a record for the entry does exist. It can only say that if a record
- * exists, it will be in a particular chapter. The request can then be dispatched to that chapter
- * for further processing.
- *
- * If the volume_index_record does not actually match the record name, the index can store a more
- * specific collision record to disambiguate the new entry from the existing one. Index entries are
- * managed with volume_index_record structures.
- */
-
-#define NO_CHAPTER U64_MAX
-
-struct volume_index_stats {
-       /* Nanoseconds spent rebalancing */
-       ktime_t rebalance_time;
-       /* Number of memory rebalances */
-       u32 rebalance_count;
-       /* The number of records in the index */
-       u64 record_count;
-       /* The number of collision records */
-       u64 collision_count;
-       /* The number of records removed */
-       u64 discard_count;
-       /* The number of UDS_OVERFLOWs detected */
-       u64 overflow_count;
-       /* The number of delta lists */
-       u32 delta_lists;
-       /* Number of early flushes */
-       u64 early_flushes;
-};
-
-struct volume_sub_index_zone {
-       u64 virtual_chapter_low;
-       u64 virtual_chapter_high;
-       u64 early_flushes;
-} __aligned(L1_CACHE_BYTES);
-
-struct volume_sub_index {
-       /* The delta index */
-       struct delta_index delta_index;
-       /* The first chapter to be flushed in each zone */
-       u64 *flush_chapters;
-       /* The zones */
-       struct volume_sub_index_zone *zones;
-       /* The volume nonce */
-       u64 volume_nonce;
-       /* Expected size of a chapter (per zone) */
-       u64 chapter_zone_bits;
-       /* Maximum size of the index (per zone) */
-       u64 max_zone_bits;
-       /* The number of bits in address mask */
-       u8 address_bits;
-       /* Mask to get address within delta list */
-       u32 address_mask;
-       /* The number of bits in chapter number */
-       u8 chapter_bits;
-       /* The largest storable chapter number */
-       u32 chapter_mask;
-       /* The number of chapters used */
-       u32 chapter_count;
-       /* The number of delta lists */
-       u32 list_count;
-       /* The number of zones */
-       unsigned int zone_count;
-       /* The amount of memory allocated */
-       u64 memory_size;
-};
-
-struct volume_index_zone {
-       /* Protects the sampled index in this zone */
-       struct mutex hook_mutex;
-} __aligned(L1_CACHE_BYTES);
-
-struct volume_index {
-       u32 sparse_sample_rate;
-       unsigned int zone_count;
-       u64 memory_size;
-       struct volume_sub_index vi_non_hook;
-       struct volume_sub_index vi_hook;
-       struct volume_index_zone *zones;
-};
-
-/*
- * The volume_index_record structure is used to facilitate processing of a record name. A client
- * first calls uds_get_volume_index_record() to find the volume index record for a record name. The
- * fields of the record can then be examined to determine the state of the record.
- *
- * If is_found is false, then the index did not find an entry for the record name. Calling
- * uds_put_volume_index_record() will insert a new entry for that name at the proper place.
- *
- * If is_found is true, then we did find an entry for the record name, and the virtual_chapter and
- * is_collision fields reflect the entry found. Subsequently, a call to
- * uds_remove_volume_index_record() will remove the entry, a call to
- * uds_set_volume_index_record_chapter() will update the existing entry, and a call to
- * uds_put_volume_index_record() will insert a new collision record after the existing entry.
- */
-struct volume_index_record {
-       /* Public fields */
-
-       /* Chapter where the record info is found */
-       u64 virtual_chapter;
-       /* This record is a collision */
-       bool is_collision;
-       /* This record is the requested record */
-       bool is_found;
-
-       /* Private fields */
-
-       /* Zone that contains this name */
-       unsigned int zone_number;
-       /* The volume index */
-       struct volume_sub_index *sub_index;
-       /* Mutex for accessing this delta index entry in the hook index */
-       struct mutex *mutex;
-       /* The record name to which this record refers */
-       const struct uds_record_name *name;
-       /* The delta index entry for this record */
-       struct delta_index_entry delta_entry;
-};
-
-int __must_check uds_make_volume_index(const struct uds_configuration *config,
-                                      u64 volume_nonce,
-                                      struct volume_index **volume_index);
-
-void uds_free_volume_index(struct volume_index *volume_index);
-
-int __must_check uds_compute_volume_index_save_blocks(const struct uds_configuration *config,
-                                                     size_t block_size,
-                                                     u64 *block_count);
-
-unsigned int __must_check uds_get_volume_index_zone(const struct volume_index *volume_index,
-                                                   const struct uds_record_name *name);
-
-bool __must_check uds_is_volume_index_sample(const struct volume_index *volume_index,
-                                            const struct uds_record_name *name);
-
-/*
- * This function is only used to manage sparse cache membership. Most requests should use
- * uds_get_volume_index_record() to look up index records instead.
- */
-u64 __must_check uds_lookup_volume_index_name(const struct volume_index *volume_index,
-                                             const struct uds_record_name *name);
-
-int __must_check uds_get_volume_index_record(struct volume_index *volume_index,
-                                            const struct uds_record_name *name,
-                                            struct volume_index_record *record);
-
-int __must_check uds_put_volume_index_record(struct volume_index_record *record,
-                                            u64 virtual_chapter);
-
-int __must_check uds_remove_volume_index_record(struct volume_index_record *record);
-
-int __must_check uds_set_volume_index_record_chapter(struct volume_index_record *record,
-                                                    u64 virtual_chapter);
-
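As a rough illustration of the lookup-then-update flow described in the comments above, a caller could combine these functions as in the following sketch. It is only a sketch: record_name_in_chapter() is a hypothetical helper, not part of this interface, and it simply returns the first error it encounters.

static int record_name_in_chapter(struct volume_index *volume_index,
                                  const struct uds_record_name *name,
                                  u64 virtual_chapter)
{
        struct volume_index_record record;
        int result;

        result = uds_get_volume_index_record(volume_index, name, &record);
        if (result != UDS_SUCCESS)
                return result;

        if (!record.is_found) {
                /* No entry for this name yet: insert one for this chapter. */
                return uds_put_volume_index_record(&record, virtual_chapter);
        }

        /* An entry already exists: point it at the new chapter. */
        return uds_set_volume_index_record_chapter(&record, virtual_chapter);
}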
-void uds_set_volume_index_open_chapter(struct volume_index *volume_index,
-                                      u64 virtual_chapter);
-
-void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index,
-                                           unsigned int zone_number,
-                                           u64 virtual_chapter);
-
-int __must_check uds_load_volume_index(struct volume_index *volume_index,
-                                      struct buffered_reader **readers,
-                                      unsigned int reader_count);
-
-int __must_check uds_save_volume_index(struct volume_index *volume_index,
-                                      struct buffered_writer **writers,
-                                      unsigned int writer_count);
-
-void uds_get_volume_index_stats(const struct volume_index *volume_index,
-                               struct volume_index_stats *stats);
-
-#endif /* UDS_VOLUME_INDEX_H */
diff --git a/drivers/md/dm-vdo/volume.c b/drivers/md/dm-vdo/volume.c
deleted file mode 100644 (file)
index 60416dc..0000000
+++ /dev/null
@@ -1,1694 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright 2023 Red Hat
- */
-
-#include "volume.h"
-
-#include <linux/atomic.h>
-#include <linux/dm-bufio.h>
-#include <linux/err.h>
-
-#include "chapter-index.h"
-#include "config.h"
-#include "errors.h"
-#include "geometry.h"
-#include "hash-utils.h"
-#include "index.h"
-#include "logger.h"
-#include "memory-alloc.h"
-#include "permassert.h"
-#include "sparse-cache.h"
-#include "string-utils.h"
-#include "thread-utils.h"
-
-/*
- * The first block of the volume layout is reserved for the volume header, which is no longer used.
- * The remainder of the volume is divided into chapters consisting of several pages of records, and
- * several pages of static index to use to find those records. The index pages are recorded first,
- * followed by the record pages. The chapters are written in order as they are filled, so the
- * volume storage acts as a circular log of the most recent chapters, with each new chapter
- * overwriting the oldest saved one.
- *
- * When a new chapter is filled and closed, the records from that chapter are sorted and
- * interleaved in approximate temporal order, and assigned to record pages. Then a static delta
- * index is generated to store which record page contains each record. The in-memory index page map
- * is also updated to indicate which delta lists fall on each chapter index page. This means that
- * when a record is read, the volume only has to load a single index page and a single record page,
- * rather than search the entire chapter. These index and record pages are written to storage, and
- * the index pages are transferred to the page cache under the theory that the most recently
- * written chapter is likely to be accessed again soon.
- *
- * When reading a record, the volume index will indicate which chapter should contain it. The
- * volume uses the index page map to determine which chapter index page needs to be loaded, and
- * then reads the relevant record page number from the chapter index. Both index and record pages
- * are stored in a page cache when read for the common case that subsequent records need the same
- * pages. The page cache evicts the least recently accessed entries when caching new pages. In
- * addition, the volume uses dm-bufio to manage access to the storage, which may allow for
- * additional caching depending on available system resources.
- *
- * Record requests are handled from cached pages when possible. If a page needs to be read, it is
- * placed on a queue along with the request that wants to read it. Any requests for the same page
- * that arrive while the read is pending are added to the queue entry. A separate reader thread
- * handles the queued reads, adding the page to the cache and updating any requests queued with it
- * so they can continue processing. This allows the index zone threads to continue processing new
- * requests rather than wait for the storage reads.
- *
- * When an index rebuild is necessary, the volume reads each stored chapter to determine which
- * range of chapters contain valid records, so that those records can be used to reconstruct the
- * in-memory volume index.
- */
-
-enum {
-       /* The maximum allowable number of contiguous bad chapters */
-       MAX_BAD_CHAPTERS = 100,
-       VOLUME_CACHE_MAX_ENTRIES = (U16_MAX >> 1),
-       VOLUME_CACHE_QUEUED_FLAG = (1 << 15),
-       VOLUME_CACHE_MAX_QUEUED_READS = 4096,
-};
-
-static const u64 BAD_CHAPTER = U64_MAX;
-
-/*
- * The invalidate counter is two 32-bit fields stored together atomically. The low order 32 bits
- * are the physical page number of the cached page being read. The high order 32 bits are a
- * sequence number. This value is written when the zone that owns it begins or completes a cache
- * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting
- * to update the cache contents.
- */
-union invalidate_counter {
-       u64 value;
-       struct {
-               u32 page;
-               u32 counter;
-       };
-};
-
-static inline u32 map_to_page_number(struct index_geometry *geometry, u32 physical_page)
-{
-       return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter;
-}
-
-static inline u32 map_to_chapter_number(struct index_geometry *geometry, u32 physical_page)
-{
-       return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter;
-}
-
-static inline bool is_record_page(struct index_geometry *geometry, u32 physical_page)
-{
-       return map_to_page_number(geometry, physical_page) >= geometry->index_pages_per_chapter;
-}
-
-static u32 map_to_physical_page(const struct index_geometry *geometry, u32 chapter, u32 page)
-{
-       /* Page zero is the header page, so the first chapter index page is page one. */
-       return HEADER_PAGES_PER_VOLUME + (geometry->pages_per_chapter * chapter) + page;
-}
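To sanity-check the mapping arithmetic above, the following standalone sketch walks one page through the same formulas using a hypothetical geometry of 256 pages per chapter, 10 of them index pages; only the single reserved header page is taken from the comment above, and the other constants are illustrative.

#include <assert.h>

#define HEADER_PAGES_PER_VOLUME 1       /* page zero holds the volume header */
#define PAGES_PER_CHAPTER 256           /* hypothetical geometry */
#define INDEX_PAGES_PER_CHAPTER 10      /* hypothetical geometry */

int main(void)
{
        /* Chapter 2, page 3 within the chapter: 1 + (2 * 256) + 3 = 516. */
        unsigned int physical_page =
                HEADER_PAGES_PER_VOLUME + (2 * PAGES_PER_CHAPTER) + 3;

        assert(physical_page == 516);
        /* Mapping back recovers the chapter number and the page within it. */
        assert((physical_page - HEADER_PAGES_PER_VOLUME) / PAGES_PER_CHAPTER == 2);
        assert((physical_page - HEADER_PAGES_PER_VOLUME) % PAGES_PER_CHAPTER == 3);
        /* Page 3 is below the 10 index pages, so it is not a record page. */
        assert((physical_page - HEADER_PAGES_PER_VOLUME) % PAGES_PER_CHAPTER <
               INDEX_PAGES_PER_CHAPTER);
        return 0;
}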
-
-static inline union invalidate_counter get_invalidate_counter(struct page_cache *cache,
-                                                             unsigned int zone_number)
-{
-       return (union invalidate_counter) {
-               .value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value),
-       };
-}
-
-static inline void set_invalidate_counter(struct page_cache *cache,
-                                         unsigned int zone_number,
-                                         union invalidate_counter invalidate_counter)
-{
-       WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value,
-                  invalidate_counter.value);
-}
-
-static inline bool search_pending(union invalidate_counter invalidate_counter)
-{
-       return (invalidate_counter.counter & 1) != 0;
-}
-
-/* Lock the cache for a zone in order to search for a page. */
-static void begin_pending_search(struct page_cache *cache, u32 physical_page,
-                                unsigned int zone_number)
-{
-       union invalidate_counter invalidate_counter =
-               get_invalidate_counter(cache, zone_number);
-
-       invalidate_counter.page = physical_page;
-       invalidate_counter.counter++;
-       set_invalidate_counter(cache, zone_number, invalidate_counter);
-       ASSERT_LOG_ONLY(search_pending(invalidate_counter),
-                       "Search is pending for zone %u", zone_number);
-       /*
-        * This memory barrier ensures that the write to the invalidate counter is seen by other
-        * threads before this thread accesses the cached page. The corresponding read memory
-        * barrier is in wait_for_pending_searches().
-        */
-       smp_mb();
-}
-
-/* Unlock the cache for a zone by clearing its invalidate counter. */
-static void end_pending_search(struct page_cache *cache, unsigned int zone_number)
-{
-       union invalidate_counter invalidate_counter;
-
-       /*
-        * This memory barrier ensures that this thread completes reads of the
-        * cached page before other threads see the write to the invalidate
-        * counter.
-        */
-       smp_mb();
-
-       invalidate_counter = get_invalidate_counter(cache, zone_number);
-       ASSERT_LOG_ONLY(search_pending(invalidate_counter),
-                       "Search is pending for zone %u", zone_number);
-       invalidate_counter.counter++;
-       set_invalidate_counter(cache, zone_number, invalidate_counter);
-}
-
-static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page)
-{
-       union invalidate_counter initial_counters[MAX_ZONES];
-       unsigned int i;
-
-       /*
-        * We hold the read_threads_mutex. We are waiting for threads that do not hold the
-        * read_threads_mutex. Those threads have "locked" their targeted page by setting the
-        * search_pending_counter. The corresponding write memory barrier is in
-        * begin_pending_search().
-        */
-       smp_mb();
-
-       for (i = 0; i < cache->zone_count; i++)
-               initial_counters[i] = get_invalidate_counter(cache, i);
-       for (i = 0; i < cache->zone_count; i++) {
-               if (search_pending(initial_counters[i]) &&
-                   (initial_counters[i].page == physical_page)) {
-                       /*
-                        * There is an active search using the physical page. We need to wait for
-                        * the search to finish.
-                        *
-                        * FIXME: Investigate using wait_event() to wait for the search to finish.
-                        */
-                       while (initial_counters[i].value ==
-                              get_invalidate_counter(cache, i).value)
-                               cond_resched();
-               }
-       }
-}
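The search-pending protocol implemented by the three functions above can be modeled in a few lines: a zone's counter is odd exactly while it has a search in flight, and an invalidating thread only needs to wait when that in-flight search targets the same physical page. A minimal standalone sketch with illustrative values:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* One zone's counter, mirroring union invalidate_counter above. */
struct model_counter {
        uint32_t page;
        uint32_t counter;
};

static bool model_search_pending(struct model_counter c)
{
        return (c.counter & 1) != 0;
}

/* Would a thread invalidating physical_page have to wait on this zone? */
static bool model_must_wait(struct model_counter c, uint32_t physical_page)
{
        return model_search_pending(c) && (c.page == physical_page);
}

int main(void)
{
        struct model_counter zone = { .page = 0, .counter = 0 };

        /* begin_pending_search(516): record the page, the counter goes odd. */
        zone.page = 516;
        zone.counter++;
        assert(model_must_wait(zone, 516));
        assert(!model_must_wait(zone, 700));

        /* end_pending_search(): the counter goes even, nobody has to wait. */
        zone.counter++;
        assert(!model_must_wait(zone, 516));
        return 0;
}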
-
-static void release_page_buffer(struct cached_page *page)
-{
-       if (page->buffer != NULL)
-               dm_bufio_release(uds_forget(page->buffer));
-}
-
-static void clear_cache_page(struct page_cache *cache, struct cached_page *page)
-{
-       /* Do not clear read_pending because the read queue relies on it. */
-       release_page_buffer(page);
-       page->physical_page = cache->indexable_pages;
-       WRITE_ONCE(page->last_used, 0);
-}
-
-static void make_page_most_recent(struct page_cache *cache, struct cached_page *page)
-{
-       /*
-        * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
-        * thread holding the read_threads_mutex.
-        */
-       if (atomic64_read(&cache->clock) != READ_ONCE(page->last_used))
-               WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock));
-}
-
-/* Select a page to remove from the cache to make space for a new entry. */
-static struct cached_page *select_victim_in_cache(struct page_cache *cache)
-{
-       struct cached_page *page;
-       int oldest_index = 0;
-       s64 oldest_time = S64_MAX;
-       s64 last_used;
-       u16 i;
-
-       /* Find the oldest unclaimed page. We hold the read_threads_mutex. */
-       for (i = 0; i < cache->cache_slots; i++) {
-               /* A page with a pending read must not be replaced. */
-               if (cache->cache[i].read_pending)
-                       continue;
-
-               last_used = READ_ONCE(cache->cache[i].last_used);
-               if (last_used <= oldest_time) {
-                       oldest_time = last_used;
-                       oldest_index = i;
-               }
-       }
-
-       page = &cache->cache[oldest_index];
-       if (page->physical_page != cache->indexable_pages) {
-               WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
-               wait_for_pending_searches(cache, page->physical_page);
-       }
-
-       page->read_pending = true;
-       clear_cache_page(cache, page);
-       return page;
-}
-
-/* Make a newly filled cache entry available to other threads. */
-static int put_page_in_cache(struct page_cache *cache, u32 physical_page,
-                            struct cached_page *page)
-{
-       int result;
-
-       /* We hold the read_threads_mutex. */
-       result = ASSERT((page->read_pending), "page to install has a pending read");
-       if (result != UDS_SUCCESS)
-               return result;
-
-       page->physical_page = physical_page;
-       make_page_most_recent(cache, page);
-       page->read_pending = false;
-
-       /*
-        * We hold the read_threads_mutex, but we must have a write memory barrier before making
-        * the cached_page available to the readers that do not hold the mutex. The corresponding
-        * read memory barrier is in get_page_and_index().
-        */
-       smp_wmb();
-
-       /* This assignment also clears the queued flag. */
-       WRITE_ONCE(cache->index[physical_page], page - cache->cache);
-       return UDS_SUCCESS;
-}
-
-static void cancel_page_in_cache(struct page_cache *cache, u32 physical_page,
-                                struct cached_page *page)
-{
-       int result;
-
-       /* We hold the read_threads_mutex. */
-       result = ASSERT((page->read_pending), "page to install has a pending read");
-       if (result != UDS_SUCCESS)
-               return;
-
-       clear_cache_page(cache, page);
-       page->read_pending = false;
-
-       /* Clear the mapping and the queued flag for the new page. */
-       WRITE_ONCE(cache->index[physical_page], cache->cache_slots);
-}
-
-static inline u16 next_queue_position(u16 position)
-{
-       return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS;
-}
-
-static inline void advance_queue_position(u16 *position)
-{
-       *position = next_queue_position(*position);
-}
-
-static inline bool read_queue_is_full(struct page_cache *cache)
-{
-       return cache->read_queue_first == next_queue_position(cache->read_queue_last);
-}
-
-static bool enqueue_read(struct page_cache *cache, struct uds_request *request,
-                        u32 physical_page)
-{
-       struct queued_read *queue_entry;
-       u16 last = cache->read_queue_last;
-       u16 read_queue_index;
-
-       /* We hold the read_threads_mutex. */
-       if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) {
-               /* This page has no existing entry in the queue. */
-               if (read_queue_is_full(cache))
-                       return false;
-
-               /* Fill in the read queue entry. */
-               cache->read_queue[last].physical_page = physical_page;
-               cache->read_queue[last].invalid = false;
-               cache->read_queue[last].first_request = NULL;
-               cache->read_queue[last].last_request = NULL;
-
-               /* Point the cache index to the read queue entry. */
-               read_queue_index = last;
-               WRITE_ONCE(cache->index[physical_page],
-                          read_queue_index | VOLUME_CACHE_QUEUED_FLAG);
-
-               advance_queue_position(&cache->read_queue_last);
-       } else {
-               /* It's already queued, so add this request to the existing entry. */
-               read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG;
-       }
-
-       request->next_request = NULL;
-       queue_entry = &cache->read_queue[read_queue_index];
-       if (queue_entry->first_request == NULL)
-               queue_entry->first_request = request;
-       else
-               queue_entry->last_request->next_request = request;
-       queue_entry->last_request = request;
-
-       return true;
-}
-
-static void enqueue_page_read(struct volume *volume, struct uds_request *request,
-                             u32 physical_page)
-{
-       /* Mark the page as queued, so that chapter invalidation knows to cancel a read. */
-       while (!enqueue_read(&volume->page_cache, request, physical_page)) {
-               uds_log_debug("Read queue full, waiting for reads to finish");
-               uds_wait_cond(&volume->read_threads_read_done_cond,
-                             &volume->read_threads_mutex);
-       }
-
-       uds_signal_cond(&volume->read_threads_cond);
-}
-
-/*
- * Reserve the next read queue entry for processing, but do not actually remove it from the queue.
- * Must be followed by release_queued_requests().
- */
-static struct queued_read *reserve_read_queue_entry(struct page_cache *cache)
-{
-       /* We hold the read_threads_mutex. */
-       struct queued_read *entry;
-       u16 index_value;
-       bool queued;
-
-       /* No items to dequeue */
-       if (cache->read_queue_next_read == cache->read_queue_last)
-               return NULL;
-
-       entry = &cache->read_queue[cache->read_queue_next_read];
-       index_value = cache->index[entry->physical_page];
-       queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
-       /* Check to see if it's still queued before resetting. */
-       if (entry->invalid && queued)
-               WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots);
-
-       /*
-        * If a synchronous read has taken this page, set invalid to true so it doesn't get
-        * overwritten. Requests will just be requeued.
-        */
-       if (!queued)
-               entry->invalid = true;
-
-       entry->reserved = true;
-       advance_queue_position(&cache->read_queue_next_read);
-       return entry;
-}
-
-static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume)
-{
-       struct queued_read *queue_entry = NULL;
-
-       while (!volume->read_threads_exiting) {
-               queue_entry = reserve_read_queue_entry(&volume->page_cache);
-               if (queue_entry != NULL)
-                       break;
-
-               uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex);
-       }
-
-       return queue_entry;
-}
-
-static int init_chapter_index_page(const struct volume *volume, u8 *index_page,
-                                  u32 chapter, u32 index_page_number,
-                                  struct delta_index_page *chapter_index_page)
-{
-       u64 ci_virtual;
-       u32 ci_chapter;
-       u32 lowest_list;
-       u32 highest_list;
-       struct index_geometry *geometry = volume->geometry;
-       int result;
-
-       result = uds_initialize_chapter_index_page(chapter_index_page, geometry,
-                                                  index_page, volume->nonce);
-       if (volume->lookup_mode == LOOKUP_FOR_REBUILD)
-               return result;
-
-       if (result != UDS_SUCCESS) {
-               return uds_log_error_strerror(result,
-                                             "Reading chapter index page for chapter %u page %u",
-                                             chapter, index_page_number);
-       }
-
-       uds_get_list_number_bounds(volume->index_page_map, chapter, index_page_number,
-                                  &lowest_list, &highest_list);
-       ci_virtual = chapter_index_page->virtual_chapter_number;
-       ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual);
-       if ((chapter == ci_chapter) &&
-           (lowest_list == chapter_index_page->lowest_list_number) &&
-           (highest_list == chapter_index_page->highest_list_number))
-               return UDS_SUCCESS;
-
-       uds_log_warning("Index page map updated to %llu",
-                       (unsigned long long) volume->index_page_map->last_update);
-       uds_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u",
-                       chapter, index_page_number, lowest_list, highest_list,
-                       (unsigned long long) ci_virtual,
-                       chapter_index_page->lowest_list_number,
-                       chapter_index_page->highest_list_number);
-       return uds_log_error_strerror(UDS_CORRUPT_DATA,
-                                     "index page map mismatch with chapter index");
-}
-
-static int initialize_index_page(const struct volume *volume, u32 physical_page,
-                                struct cached_page *page)
-{
-       u32 chapter = map_to_chapter_number(volume->geometry, physical_page);
-       u32 index_page_number = map_to_page_number(volume->geometry, physical_page);
-
-       return init_chapter_index_page(volume, dm_bufio_get_block_data(page->buffer),
-                                      chapter, index_page_number, &page->index_page);
-}
-
-static bool search_record_page(const u8 record_page[],
-                              const struct uds_record_name *name,
-                              const struct index_geometry *geometry,
-                              struct uds_record_data *metadata)
-{
-       /*
-        * The array of records is sorted by name and stored as a binary tree in heap order, so the
-        * root of the tree is the first array element.
-        */
-       u32 node = 0;
-       const struct uds_volume_record *records = (const struct uds_volume_record *) record_page;
-
-       while (node < geometry->records_per_page) {
-               int result;
-               const struct uds_volume_record *record = &records[node];
-
-               result = memcmp(name, &record->name, UDS_RECORD_NAME_SIZE);
-               if (result == 0) {
-                       if (metadata != NULL)
-                               *metadata = record->data;
-                       return true;
-               }
-
-               /* The children of node N are at indexes 2N+1 and 2N+2. */
-               node = ((2 * node) + ((result < 0) ? 1 : 2));
-       }
-
-       return false;
-}
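The heap-ordered layout that search_record_page() walks can be demonstrated with plain integers standing in for record names. In the sketch below the sorted values 10 through 70 are stored in level order, so the root is element 0 and the children of node N sit at 2N+1 and 2N+2, exactly as the comment in the function describes; the values and function names are illustrative only.

#include <assert.h>
#include <stdbool.h>

static bool heap_search(const int *records, unsigned int count, int target)
{
        unsigned int node = 0;

        while (node < count) {
                if (records[node] == target)
                        return true;
                /* Smaller targets go to the left child, larger to the right. */
                node = (2 * node) + ((target < records[node]) ? 1 : 2);
        }

        return false;
}

int main(void)
{
        /* The sorted values {10, ..., 70} laid out as a heap-ordered tree. */
        const int records[] = { 40, 20, 60, 10, 30, 50, 70 };

        assert(heap_search(records, 7, 50));
        assert(!heap_search(records, 7, 45));
        return 0;
}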
-
-/*
- * If we've read in a record page, we're going to do an immediate search, to speed up processing by
- * avoiding get_record_from_zone(), and to ensure that requests make progress even when queued. If
- * we've read in an index page, we save the record page number so we don't have to resolve the
- * index page again. We use the location, virtual_chapter, and old_metadata fields in the request
- * to allow the index code to know where to begin processing the request again.
- */
-static int search_page(struct cached_page *page, const struct volume *volume,
-                      struct uds_request *request, u32 physical_page)
-{
-       int result;
-       enum uds_index_region location;
-       u16 record_page_number;
-
-       if (is_record_page(volume->geometry, physical_page)) {
-               if (search_record_page(dm_bufio_get_block_data(page->buffer),
-                                      &request->record_name, volume->geometry,
-                                      &request->old_metadata))
-                       location = UDS_LOCATION_RECORD_PAGE_LOOKUP;
-               else
-                       location = UDS_LOCATION_UNAVAILABLE;
-       } else {
-               result = uds_search_chapter_index_page(&page->index_page,
-                                                      volume->geometry,
-                                                      &request->record_name,
-                                                      &record_page_number);
-               if (result != UDS_SUCCESS)
-                       return result;
-
-               if (record_page_number == NO_CHAPTER_INDEX_ENTRY) {
-                       location = UDS_LOCATION_UNAVAILABLE;
-               } else {
-                       location = UDS_LOCATION_INDEX_PAGE_LOOKUP;
-                       *((u16 *) &request->old_metadata) = record_page_number;
-               }
-       }
-
-       request->location = location;
-       request->found = false;
-       return UDS_SUCCESS;
-}
-
-static int process_entry(struct volume *volume, struct queued_read *entry)
-{
-       u32 page_number = entry->physical_page;
-       struct uds_request *request;
-       struct cached_page *page = NULL;
-       u8 *page_data;
-       int result;
-
-       if (entry->invalid) {
-               uds_log_debug("Requeuing requests for invalid page");
-               return UDS_SUCCESS;
-       }
-
-       page = select_victim_in_cache(&volume->page_cache);
-
-       mutex_unlock(&volume->read_threads_mutex);
-       page_data = dm_bufio_read(volume->client, page_number, &page->buffer);
-       mutex_lock(&volume->read_threads_mutex);
-       if (IS_ERR(page_data)) {
-               result = -PTR_ERR(page_data);
-               uds_log_warning_strerror(result,
-                                        "error reading physical page %u from volume",
-                                        page_number);
-               cancel_page_in_cache(&volume->page_cache, page_number, page);
-               return result;
-       }
-
-       if (entry->invalid) {
-               uds_log_warning("Page %u invalidated after read", page_number);
-               cancel_page_in_cache(&volume->page_cache, page_number, page);
-               return UDS_SUCCESS;
-       }
-
-       if (!is_record_page(volume->geometry, page_number)) {
-               result = initialize_index_page(volume, page_number, page);
-               if (result != UDS_SUCCESS) {
-                       uds_log_warning("Error initializing chapter index page");
-                       cancel_page_in_cache(&volume->page_cache, page_number, page);
-                       return result;
-               }
-       }
-
-       result = put_page_in_cache(&volume->page_cache, page_number, page);
-       if (result != UDS_SUCCESS) {
-               uds_log_warning("Error putting page %u in cache", page_number);
-               cancel_page_in_cache(&volume->page_cache, page_number, page);
-               return result;
-       }
-
-       request = entry->first_request;
-       while ((request != NULL) && (result == UDS_SUCCESS)) {
-               result = search_page(page, volume, request, page_number);
-               request = request->next_request;
-       }
-
-       return result;
-}
-
-static void release_queued_requests(struct volume *volume, struct queued_read *entry,
-                                   int result)
-{
-       struct page_cache *cache = &volume->page_cache;
-       u16 next_read = cache->read_queue_next_read;
-       struct uds_request *request;
-       struct uds_request *next;
-
-       for (request = entry->first_request; request != NULL; request = next) {
-               next = request->next_request;
-               request->status = result;
-               request->requeued = true;
-               uds_enqueue_request(request, STAGE_INDEX);
-       }
-
-       entry->reserved = false;
-
-       /* Move the read_queue_first pointer as far as we can. */
-       while ((cache->read_queue_first != next_read) &&
-              (!cache->read_queue[cache->read_queue_first].reserved))
-               advance_queue_position(&cache->read_queue_first);
-       uds_broadcast_cond(&volume->read_threads_read_done_cond);
-}
-
-static void read_thread_function(void *arg)
-{
-       struct volume *volume = arg;
-
-       uds_log_debug("reader starting");
-       mutex_lock(&volume->read_threads_mutex);
-       while (true) {
-               struct queued_read *queue_entry;
-               int result;
-
-               queue_entry = wait_to_reserve_read_queue_entry(volume);
-               if (volume->read_threads_exiting)
-                       break;
-
-               result = process_entry(volume, queue_entry);
-               release_queued_requests(volume, queue_entry, result);
-       }
-       mutex_unlock(&volume->read_threads_mutex);
-       uds_log_debug("reader done");
-}
-
-static void get_page_and_index(struct page_cache *cache, u32 physical_page,
-                              int *queue_index, struct cached_page **page_ptr)
-{
-       u16 index_value;
-       u16 index;
-       bool queued;
-
-       /*
-        * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any
-        * thread holding the read_threads_mutex.
-        *
-        * Holding only a search_pending_counter is the most frequent case.
-        */
-       /*
-        * It would be unlikely for the compiler to turn the usage of index_value into two reads of
-        * cache->index, but it would be possible and very bad if those reads did not return the
-        * same bits.
-        */
-       index_value = READ_ONCE(cache->index[physical_page]);
-       queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0;
-       index = index_value & ~VOLUME_CACHE_QUEUED_FLAG;
-
-       if (!queued && (index < cache->cache_slots)) {
-               *page_ptr = &cache->cache[index];
-               /*
-                * We have acquired access to the cached page, but unless we hold the
-                * read_threads_mutex, we need a read memory barrier now. The corresponding write
-                * memory barrier is in put_page_in_cache().
-                */
-               smp_rmb();
-       } else {
-               *page_ptr = NULL;
-       }
-
-       *queue_index = queued ? index : -1;
-}
-
-static void get_page_from_cache(struct page_cache *cache, u32 physical_page,
-                               struct cached_page **page)
-{
-       /*
-        * ASSERTION: We are in a zone thread.
-        * ASSERTION: We are holding a search_pending_counter or the read_threads_mutex.
-        */
-       int queue_index = -1;
-
-       get_page_and_index(cache, physical_page, &queue_index, page);
-}
-
-static int read_page_locked(struct volume *volume, u32 physical_page,
-                           struct cached_page **page_ptr)
-{
-       int result = UDS_SUCCESS;
-       struct cached_page *page = NULL;
-       u8 *page_data;
-
-       page = select_victim_in_cache(&volume->page_cache);
-       page_data = dm_bufio_read(volume->client, physical_page, &page->buffer);
-       if (IS_ERR(page_data)) {
-               result = -PTR_ERR(page_data);
-               uds_log_warning_strerror(result,
-                                        "error reading physical page %u from volume",
-                                        physical_page);
-               cancel_page_in_cache(&volume->page_cache, physical_page, page);
-               return result;
-       }
-
-       if (!is_record_page(volume->geometry, physical_page)) {
-               result = initialize_index_page(volume, physical_page, page);
-               if (result != UDS_SUCCESS) {
-                       if (volume->lookup_mode != LOOKUP_FOR_REBUILD)
-                               uds_log_warning("Corrupt index page %u", physical_page);
-                       cancel_page_in_cache(&volume->page_cache, physical_page, page);
-                       return result;
-               }
-       }
-
-       result = put_page_in_cache(&volume->page_cache, physical_page, page);
-       if (result != UDS_SUCCESS) {
-               uds_log_warning("Error putting page %u in cache", physical_page);
-               cancel_page_in_cache(&volume->page_cache, physical_page, page);
-               return result;
-       }
-
-       *page_ptr = page;
-       return UDS_SUCCESS;
-}
-
-/* Retrieve a page from the cache while holding the read threads mutex. */
-static int get_volume_page_locked(struct volume *volume, u32 physical_page,
-                                 struct cached_page **page_ptr)
-{
-       int result;
-       struct cached_page *page = NULL;
-
-       get_page_from_cache(&volume->page_cache, physical_page, &page);
-       if (page == NULL) {
-               result = read_page_locked(volume, physical_page, &page);
-               if (result != UDS_SUCCESS)
-                       return result;
-       } else {
-               make_page_most_recent(&volume->page_cache, page);
-       }
-
-       *page_ptr = page;
-       return UDS_SUCCESS;
-}
-
-/* Retrieve a page from the cache while holding a search_pending lock. */
-static int get_volume_page_protected(struct volume *volume, struct uds_request *request,
-                                    u32 physical_page, struct cached_page **page_ptr)
-{
-       struct cached_page *page;
-
-       get_page_from_cache(&volume->page_cache, physical_page, &page);
-       if (page != NULL) {
-               if (request->zone_number == 0) {
-                       /* Only one zone is allowed to update the LRU. */
-                       make_page_most_recent(&volume->page_cache, page);
-               }
-
-               *page_ptr = page;
-               return UDS_SUCCESS;
-       }
-
-       /* Prepare to enqueue a read for the page. */
-       end_pending_search(&volume->page_cache, request->zone_number);
-       mutex_lock(&volume->read_threads_mutex);
-
-       /*
-        * Do the lookup again while holding the read mutex (no longer the fast case so this should
-        * be fine to repeat). We need to do this because a page may have been added to the cache
-        * by a reader thread between the time we searched above and the time we went to actually
-        * try to enqueue it below. This could result in us enqueuing another read for a page which
-        * is already in the cache, which would mean we end up with two entries in the cache for
-        * the same page.
-        */
-       get_page_from_cache(&volume->page_cache, physical_page, &page);
-       if (page == NULL) {
-               enqueue_page_read(volume, request, physical_page);
-               /*
-                * The performance gain from unlocking first, while "search pending" mode is off,
-                * turns out to be significant in some cases. The page is not available yet so
-                * the order does not matter for correctness as it does below.
-                */
-               mutex_unlock(&volume->read_threads_mutex);
-               begin_pending_search(&volume->page_cache, physical_page,
-                                    request->zone_number);
-               return UDS_QUEUED;
-       }
-
-       /*
-        * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and
-        * "search pending" state in careful order so no other thread can mess with the data before
-        * the caller gets to look at it.
-        */
-       begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
-       mutex_unlock(&volume->read_threads_mutex);
-       *page_ptr = page;
-       return UDS_SUCCESS;
-}
-
-static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number,
-                          struct cached_page **page_ptr)
-{
-       int result;
-       u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
-
-       mutex_lock(&volume->read_threads_mutex);
-       result = get_volume_page_locked(volume, physical_page, page_ptr);
-       mutex_unlock(&volume->read_threads_mutex);
-       return result;
-}
-
-int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number,
-                              u8 **data_ptr)
-{
-       int result;
-       struct cached_page *page = NULL;
-
-       result = get_volume_page(volume, chapter, page_number, &page);
-       if (result == UDS_SUCCESS)
-               *data_ptr = dm_bufio_get_block_data(page->buffer);
-       return result;
-}
-
-int uds_get_volume_index_page(struct volume *volume, u32 chapter, u32 page_number,
-                             struct delta_index_page **index_page_ptr)
-{
-       int result;
-       struct cached_page *page = NULL;
-
-       result = get_volume_page(volume, chapter, page_number, &page);
-       if (result == UDS_SUCCESS)
-               *index_page_ptr = &page->index_page;
-       return result;
-}
-
-/*
- * Find the record page number associated with a name in a given index page. This will return
- * UDS_QUEUED if the page in question must be read from storage.
- */
-static int search_cached_index_page(struct volume *volume, struct uds_request *request,
-                                   u32 chapter, u32 index_page_number,
-                                   u16 *record_page_number)
-{
-       int result;
-       struct cached_page *page = NULL;
-       u32 physical_page = map_to_physical_page(volume->geometry, chapter,
-                                                index_page_number);
-
-       /*
-        * Make sure the invalidate counter is updated before we try to read the mapping. This
-        * prevents this thread from reading a page in the cache which has already been marked for
-        * invalidation by the reader thread, before the reader thread has noticed that the
-        * invalidate_counter has been incremented.
-        */
-       begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
-
-       result = get_volume_page_protected(volume, request, physical_page, &page);
-       if (result != UDS_SUCCESS) {
-               end_pending_search(&volume->page_cache, request->zone_number);
-               return result;
-       }
-
-       result = uds_search_chapter_index_page(&page->index_page, volume->geometry,
-                                              &request->record_name,
-                                              record_page_number);
-       end_pending_search(&volume->page_cache, request->zone_number);
-       return result;
-}
-
-/*
- * Find the metadata associated with a name in a given record page. This will return UDS_QUEUED if
- * the page in question must be read from storage.
- */
-int uds_search_cached_record_page(struct volume *volume, struct uds_request *request,
-                                 u32 chapter, u16 record_page_number, bool *found)
-{
-       struct cached_page *record_page;
-       struct index_geometry *geometry = volume->geometry;
-       int result;
-       u32 physical_page, page_number;
-
-       *found = false;
-       if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
-               return UDS_SUCCESS;
-
-       result = ASSERT(record_page_number < geometry->record_pages_per_chapter,
-                       "0 <= %d < %u", record_page_number,
-                       geometry->record_pages_per_chapter);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       page_number = geometry->index_pages_per_chapter + record_page_number;
-
-       physical_page = map_to_physical_page(volume->geometry, chapter, page_number);
-
-       /*
-        * Make sure the invalidate counter is updated before we try to read the mapping. This
-        * prevents this thread from reading a page in the cache which has already been marked for
-        * invalidation by the reader thread, before the reader thread has noticed that the
-        * invalidate_counter has been incremented.
-        */
-       begin_pending_search(&volume->page_cache, physical_page, request->zone_number);
-
-       result = get_volume_page_protected(volume, request, physical_page, &record_page);
-       if (result != UDS_SUCCESS) {
-               end_pending_search(&volume->page_cache, request->zone_number);
-               return result;
-       }
-
-       if (search_record_page(dm_bufio_get_block_data(record_page->buffer),
-                              &request->record_name, geometry, &request->old_metadata))
-               *found = true;
-
-       end_pending_search(&volume->page_cache, request->zone_number);
-       return UDS_SUCCESS;
-}
-
-void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter)
-{
-       const struct index_geometry *geometry = volume->geometry;
-       u32 physical_page = map_to_physical_page(geometry, chapter, 0);
-
-       dm_bufio_prefetch(volume->client, physical_page, geometry->pages_per_chapter);
-}
-
-int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_chapter,
-                                      struct dm_buffer *volume_buffers[],
-                                      struct delta_index_page index_pages[])
-{
-       int result;
-       u32 i;
-       const struct index_geometry *geometry = volume->geometry;
-       u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
-       u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0);
-
-       dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter);
-       for (i = 0; i < geometry->index_pages_per_chapter; i++) {
-               u8 *index_page;
-
-               index_page = dm_bufio_read(volume->client, physical_page + i,
-                                          &volume_buffers[i]);
-               if (IS_ERR(index_page)) {
-                       result = -PTR_ERR(index_page);
-                       uds_log_warning_strerror(result,
-                                                "error reading physical page %u",
-                                                physical_page);
-                       return result;
-               }
-
-               result = init_chapter_index_page(volume, index_page, physical_chapter, i,
-                                                &index_pages[i]);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       return UDS_SUCCESS;
-}
-
-int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request,
-                                bool *found)
-{
-       int result;
-       u32 physical_chapter =
-               uds_map_to_physical_chapter(volume->geometry, request->virtual_chapter);
-       u32 index_page_number;
-       u16 record_page_number;
-
-       index_page_number = uds_find_index_page_number(volume->index_page_map,
-                                                      &request->record_name,
-                                                      physical_chapter);
-
-       if (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP) {
-               record_page_number = *((u16 *) &request->old_metadata);
-       } else {
-               result = search_cached_index_page(volume, request, physical_chapter,
-                                                 index_page_number,
-                                                 &record_page_number);
-               if (result != UDS_SUCCESS)
-                       return result;
-       }
-
-       return uds_search_cached_record_page(volume, request, physical_chapter,
-                                            record_page_number, found);
-}
-
-int uds_search_volume_page_cache_for_rebuild(struct volume *volume,
-                                            const struct uds_record_name *name,
-                                            u64 virtual_chapter, bool *found)
-{
-       int result;
-       struct index_geometry *geometry = volume->geometry;
-       struct cached_page *page;
-       u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter);
-       u32 index_page_number;
-       u16 record_page_number;
-       u32 page_number;
-
-       *found = false;
-       index_page_number =
-               uds_find_index_page_number(volume->index_page_map, name,
-                                          physical_chapter);
-       result = get_volume_page(volume, physical_chapter, index_page_number, &page);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_search_chapter_index_page(&page->index_page, geometry, name,
-                                              &record_page_number);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       if (record_page_number == NO_CHAPTER_INDEX_ENTRY)
-               return UDS_SUCCESS;
-
-       page_number = geometry->index_pages_per_chapter + record_page_number;
-       result = get_volume_page(volume, physical_chapter, page_number, &page);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       *found = search_record_page(dm_bufio_get_block_data(page->buffer), name,
-                                   geometry, NULL);
-       return UDS_SUCCESS;
-}
-
-static void invalidate_page(struct page_cache *cache, u32 physical_page)
-{
-       struct cached_page *page;
-       int queue_index = -1;
-
-       /* We hold the read_threads_mutex. */
-       get_page_and_index(cache, physical_page, &queue_index, &page);
-       if (page != NULL) {
-               WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots);
-               wait_for_pending_searches(cache, page->physical_page);
-               clear_cache_page(cache, page);
-       } else if (queue_index > -1) {
-               uds_log_debug("setting pending read to invalid");
-               cache->read_queue[queue_index].invalid = true;
-       }
-}
-
-void uds_forget_chapter(struct volume *volume, u64 virtual_chapter)
-{
-       u32 physical_chapter =
-               uds_map_to_physical_chapter(volume->geometry, virtual_chapter);
-       u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0);
-       u32 i;
-
-       uds_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter);
-       mutex_lock(&volume->read_threads_mutex);
-       for (i = 0; i < volume->geometry->pages_per_chapter; i++)
-               invalidate_page(&volume->page_cache, first_page + i);
-       mutex_unlock(&volume->read_threads_mutex);
-}
-
-/*
- * Donate an index page from a newly written chapter to the page cache since it is likely to be
- * used again soon. The caller must already hold the reader thread mutex.
- */
-static int donate_index_page_locked(struct volume *volume, u32 physical_chapter,
-                                   u32 index_page_number, struct dm_buffer *page_buffer)
-{
-       int result;
-       struct cached_page *page = NULL;
-       u32 physical_page =
-               map_to_physical_page(volume->geometry, physical_chapter,
-                                    index_page_number);
-
-       page = select_victim_in_cache(&volume->page_cache);
-       page->buffer = page_buffer;
-       result = init_chapter_index_page(volume, dm_bufio_get_block_data(page_buffer),
-                                        physical_chapter, index_page_number,
-                                        &page->index_page);
-       if (result != UDS_SUCCESS) {
-               uds_log_warning("Error initializing chapter index page");
-               cancel_page_in_cache(&volume->page_cache, physical_page, page);
-               return result;
-       }
-
-       result = put_page_in_cache(&volume->page_cache, physical_page, page);
-       if (result != UDS_SUCCESS) {
-               uds_log_warning("Error putting page %u in cache", physical_page);
-               cancel_page_in_cache(&volume->page_cache, physical_page, page);
-               return result;
-       }
-
-       return UDS_SUCCESS;
-}
-
-static int write_index_pages(struct volume *volume, u32 physical_chapter_number,
-                            struct open_chapter_index *chapter_index)
-{
-       struct index_geometry *geometry = volume->geometry;
-       struct dm_buffer *page_buffer;
-       u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0);
-       u32 delta_list_number = 0;
-       u32 index_page_number;
-
-       for (index_page_number = 0;
-            index_page_number < geometry->index_pages_per_chapter;
-            index_page_number++) {
-               u8 *page_data;
-               u32 physical_page = first_index_page + index_page_number;
-               u32 lists_packed;
-               bool last_page;
-               int result;
-
-               page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
-               if (IS_ERR(page_data)) {
-                       return uds_log_warning_strerror(-PTR_ERR(page_data),
-                                                       "failed to prepare index page");
-               }
-
-               last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter);
-               result = uds_pack_open_chapter_index_page(chapter_index, page_data,
-                                                         delta_list_number, last_page,
-                                                         &lists_packed);
-               if (result != UDS_SUCCESS) {
-                       dm_bufio_release(page_buffer);
-                       return uds_log_warning_strerror(result,
-                                                       "failed to pack index page");
-               }
-
-               dm_bufio_mark_buffer_dirty(page_buffer);
-
-               if (lists_packed == 0) {
-                       uds_log_debug("no delta lists packed on chapter %u page %u",
-                                     physical_chapter_number, index_page_number);
-               } else {
-                       delta_list_number += lists_packed;
-               }
-
-               uds_update_index_page_map(volume->index_page_map,
-                                         chapter_index->virtual_chapter_number,
-                                         physical_chapter_number, index_page_number,
-                                         delta_list_number - 1);
-
-               mutex_lock(&volume->read_threads_mutex);
-               result = donate_index_page_locked(volume, physical_chapter_number,
-                                                 index_page_number, page_buffer);
-               mutex_unlock(&volume->read_threads_mutex);
-               if (result != UDS_SUCCESS) {
-                       dm_bufio_release(page_buffer);
-                       return result;
-               }
-       }
-
-       return UDS_SUCCESS;
-}
-
-static u32 encode_tree(u8 record_page[],
-                      const struct uds_volume_record *sorted_pointers[],
-                      u32 next_record, u32 node, u32 node_count)
-{
-       if (node < node_count) {
-               u32 child = (2 * node) + 1;
-
-               next_record = encode_tree(record_page, sorted_pointers, next_record,
-                                         child, node_count);
-
-               /*
-                * In-order traversal: copy the contents of the next record into the page at the
-                * node offset.
-                */
-               memcpy(&record_page[node * BYTES_PER_RECORD],
-                      sorted_pointers[next_record++], BYTES_PER_RECORD);
-
-               next_record = encode_tree(record_page, sorted_pointers, next_record,
-                                         child + 1, node_count);
-       }
-
-       return next_record;
-}
-
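To make the in-order placement above concrete (an editorial sketch, not part of the moved file): for a page of seven records, encode_tree() stores sorted record 3 at node 0, sorted records 1 and 5 at nodes 1 and 2, and sorted records 0, 2, 4 and 6 at nodes 3 through 6, so the page can be searched as an implicit binary tree with heap-style child indexing. A hypothetical lookup over such a page, where compare_names() stands in for the real record-name comparison, might look like:

	static const u8 *search_tree_page(const u8 *page, u32 node_count,
					  const struct uds_record_name *name)
	{
		u32 node = 0;

		while (node < node_count) {
			const u8 *record = &page[node * BYTES_PER_RECORD];
			int comparison = compare_names(name, record);

			if (comparison == 0)
				return record;

			/* Smaller names sit in the left subtree, larger in the right. */
			node = (comparison < 0) ? ((2 * node) + 1) : ((2 * node) + 2);
		}

		return NULL;
	}
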
-static int encode_record_page(const struct volume *volume,
-                             const struct uds_volume_record records[], u8 record_page[])
-{
-       int result;
-       u32 i;
-       u32 records_per_page = volume->geometry->records_per_page;
-       const struct uds_volume_record **record_pointers = volume->record_pointers;
-
-       for (i = 0; i < records_per_page; i++)
-               record_pointers[i] = &records[i];
-
-       /*
-        * Sort the record pointers by using just the names in the records, which is less work than
-        * sorting the entire record values.
-        */
-       BUILD_BUG_ON(offsetof(struct uds_volume_record, name) != 0);
-       result = uds_radix_sort(volume->radix_sorter, (const u8 **) record_pointers,
-                               records_per_page, UDS_RECORD_NAME_SIZE);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       encode_tree(record_page, record_pointers, 0, 0, records_per_page);
-       return UDS_SUCCESS;
-}
-
-static int write_record_pages(struct volume *volume, u32 physical_chapter_number,
-                             const struct uds_volume_record *records)
-{
-       u32 record_page_number;
-       struct index_geometry *geometry = volume->geometry;
-       struct dm_buffer *page_buffer;
-       const struct uds_volume_record *next_record = records;
-       u32 first_record_page = map_to_physical_page(geometry, physical_chapter_number,
-                                                    geometry->index_pages_per_chapter);
-
-       for (record_page_number = 0;
-            record_page_number < geometry->record_pages_per_chapter;
-            record_page_number++) {
-               u8 *page_data;
-               u32 physical_page = first_record_page + record_page_number;
-               int result;
-
-               page_data = dm_bufio_new(volume->client, physical_page, &page_buffer);
-               if (IS_ERR(page_data)) {
-                       return uds_log_warning_strerror(-PTR_ERR(page_data),
-                                                       "failed to prepare record page");
-               }
-
-               result = encode_record_page(volume, next_record, page_data);
-               if (result != UDS_SUCCESS) {
-                       dm_bufio_release(page_buffer);
-                       return uds_log_warning_strerror(result,
-                                                       "failed to encode record page %u",
-                                                       record_page_number);
-               }
-
-               next_record += geometry->records_per_page;
-               dm_bufio_mark_buffer_dirty(page_buffer);
-               dm_bufio_release(page_buffer);
-       }
-
-       return UDS_SUCCESS;
-}
-
-int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index,
-                     const struct uds_volume_record *records)
-{
-       int result;
-       u32 physical_chapter_number =
-               uds_map_to_physical_chapter(volume->geometry,
-                                           chapter_index->virtual_chapter_number);
-
-       result = write_index_pages(volume, physical_chapter_number, chapter_index);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = write_record_pages(volume, physical_chapter_number, records);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = -dm_bufio_write_dirty_buffers(volume->client);
-       if (result != UDS_SUCCESS)
-               uds_log_error_strerror(result, "cannot sync chapter to volume");
-
-       return result;
-}
-
-static void probe_chapter(struct volume *volume, u32 chapter_number,
-                         u64 *virtual_chapter_number)
-{
-       const struct index_geometry *geometry = volume->geometry;
-       u32 expected_list_number = 0;
-       u32 i;
-       u64 vcn = BAD_CHAPTER;
-
-       *virtual_chapter_number = BAD_CHAPTER;
-       dm_bufio_prefetch(volume->client,
-                         map_to_physical_page(geometry, chapter_number, 0),
-                         geometry->index_pages_per_chapter);
-
-       for (i = 0; i < geometry->index_pages_per_chapter; i++) {
-               struct delta_index_page *page;
-               int result;
-
-               result = uds_get_volume_index_page(volume, chapter_number, i, &page);
-               if (result != UDS_SUCCESS)
-                       return;
-
-               if (page->virtual_chapter_number == BAD_CHAPTER) {
-                       uds_log_error("corrupt index page in chapter %u",
-                                     chapter_number);
-                       return;
-               }
-
-               if (vcn == BAD_CHAPTER) {
-                       vcn = page->virtual_chapter_number;
-               } else if (page->virtual_chapter_number != vcn) {
-                       uds_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu",
-                                     chapter_number, i, (unsigned long long) vcn,
-                                     (unsigned long long) page->virtual_chapter_number);
-                       return;
-               }
-
-               if (expected_list_number != page->lowest_list_number) {
-                       uds_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u",
-                                     chapter_number, i, expected_list_number,
-                                     page->lowest_list_number);
-                       return;
-               }
-               expected_list_number = page->highest_list_number + 1;
-
-               result = uds_validate_chapter_index_page(page, geometry);
-               if (result != UDS_SUCCESS)
-                       return;
-       }
-
-       if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) {
-               uds_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number,
-                             (unsigned long long) vcn, geometry->chapters_per_volume);
-               return;
-       }
-
-       *virtual_chapter_number = vcn;
-}
-
-/* Find the last valid physical chapter in the volume. */
-static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr)
-{
-       u32 span = 1;
-       u32 tries = 0;
-
-       while (limit > 0) {
-               u32 chapter = (span > limit) ? 0 : limit - span;
-               u64 vcn = 0;
-
-               probe_chapter(volume, chapter, &vcn);
-               if (vcn == BAD_CHAPTER) {
-                       limit = chapter;
-                       if (++tries > 1)
-                               span *= 2;
-               } else {
-                       if (span == 1)
-                               break;
-                       span /= 2;
-                       tries = 0;
-               }
-       }
-
-       *limit_ptr = limit;
-}
-
-static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn,
-                              u64 *highest_vcn)
-{
-       struct index_geometry *geometry = volume->geometry;
-       u64 zero_vcn;
-       u64 lowest = BAD_CHAPTER;
-       u64 highest = BAD_CHAPTER;
-       u64 moved_chapter = BAD_CHAPTER;
-       u32 left_chapter = 0;
-       u32 right_chapter = 0;
-       u32 bad_chapters = 0;
-
-       /*
-        * This method assumes there is at most one run of contiguous bad chapters caused by
-        * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the
-        * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the
-        * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately
-        * precedes the lowest one.
-        */
-
-       /* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */
-       probe_chapter(volume, 0, &zero_vcn);
-
-       /*
-        * Binary search for end of the discontinuity in the monotonically increasing virtual
-        * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're
-        * searching for the index of the smallest value less than zero_vcn. If the search runs off
-        * the end, chapter 0 has the lowest vcn.
-        *
-        * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always
-        * skip over the moved chapter when searching, adding it to the range at the end if
-        * necessary.
-        */
-       if (geometry->remapped_physical > 0) {
-               u64 remapped_vcn;
-
-               probe_chapter(volume, geometry->remapped_physical, &remapped_vcn);
-               if (remapped_vcn == geometry->remapped_virtual)
-                       moved_chapter = geometry->remapped_physical;
-       }
-
-       left_chapter = 0;
-       right_chapter = chapter_limit;
-
-       while (left_chapter < right_chapter) {
-               u64 probe_vcn;
-               u32 chapter = (left_chapter + right_chapter) / 2;
-
-               if (chapter == moved_chapter)
-                       chapter--;
-
-               probe_chapter(volume, chapter, &probe_vcn);
-               if (zero_vcn <= probe_vcn) {
-                       left_chapter = chapter + 1;
-                       if (left_chapter == moved_chapter)
-                               left_chapter++;
-               } else {
-                       right_chapter = chapter;
-               }
-       }
-
-       /* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number. */
-       if (left_chapter >= chapter_limit)
-               left_chapter = 0;
-
-       /* At this point, left_chapter is the chapter with the lowest virtual chapter number. */
-       probe_chapter(volume, left_chapter, &lowest);
-
-       /* The moved chapter might be the lowest in the range. */
-       if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1))
-               lowest = geometry->remapped_virtual;
-
-       /*
-        * Circularly scan backwards, moving over any bad chapters until encountering a good one,
-        * which is the chapter with the highest vcn.
-        */
-       while (highest == BAD_CHAPTER) {
-               right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit;
-               if (right_chapter == moved_chapter)
-                       continue;
-
-               probe_chapter(volume, right_chapter, &highest);
-               if (bad_chapters++ >= MAX_BAD_CHAPTERS) {
-                       uds_log_error("too many bad chapters in volume: %u",
-                                     bad_chapters);
-                       return UDS_CORRUPT_DATA;
-               }
-       }
-
-       *lowest_vcn = lowest;
-       *highest_vcn = highest;
-       return UDS_SUCCESS;
-}
-
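The binary search above is easier to see on a plain array: the virtual chapter numbers form an ascending sequence that has been rotated, and the loop looks for the first slot holding a value smaller than the value in slot 0. A minimal standalone sketch of that idea (editorial only; it ignores bad chapters and the remapped chapter handled by the real code) might be:

	/*
	 * Given a rotated ascending array such as {80, 81, 82, 60, 61, 62},
	 * return the index of the smallest value, i.e. the first entry that
	 * is less than values[0].
	 */
	static u32 find_rotation_point(const u64 *values, u32 count)
	{
		u32 left = 0;
		u32 right = count;

		while (left < right) {
			u32 middle = (left + right) / 2;

			if (values[0] <= values[middle])
				left = middle + 1;
			else
				right = middle;
		}

		/* Running off the end means slot 0 already holds the smallest value. */
		return (left >= count) ? 0 : left;
	}
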
-/*
- * Find the highest and lowest contiguous chapters present in the volume and determine their
- * virtual chapter numbers. This is used by rebuild.
- */
-int uds_find_volume_chapter_boundaries(struct volume *volume, u64 *lowest_vcn,
-                                      u64 *highest_vcn, bool *is_empty)
-{
-       u32 chapter_limit = volume->geometry->chapters_per_volume;
-
-       find_real_end_of_volume(volume, chapter_limit, &chapter_limit);
-       if (chapter_limit == 0) {
-               *lowest_vcn = 0;
-               *highest_vcn = 0;
-               *is_empty = true;
-               return UDS_SUCCESS;
-       }
-
-       *is_empty = false;
-       return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn);
-}
-
-int __must_check uds_replace_volume_storage(struct volume *volume,
-                                           struct index_layout *layout,
-                                           struct block_device *bdev)
-{
-       int result;
-       u32 i;
-
-       result = uds_replace_index_layout_storage(layout, bdev);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       /* Release all outstanding dm_bufio objects */
-       for (i = 0; i < volume->page_cache.indexable_pages; i++)
-               volume->page_cache.index[i] = volume->page_cache.cache_slots;
-       for (i = 0; i < volume->page_cache.cache_slots; i++)
-               clear_cache_page(&volume->page_cache, &volume->page_cache.cache[i]);
-       if (volume->sparse_cache != NULL)
-               uds_invalidate_sparse_cache(volume->sparse_cache);
-       if (volume->client != NULL)
-               dm_bufio_client_destroy(uds_forget(volume->client));
-
-       return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page,
-                                    volume->reserved_buffers, &volume->client);
-}
-
-static int __must_check initialize_page_cache(struct page_cache *cache,
-                                             const struct index_geometry *geometry,
-                                             u32 chapters_in_cache,
-                                             unsigned int zone_count)
-{
-       int result;
-       u32 i;
-
-       cache->indexable_pages = geometry->pages_per_volume + 1;
-       cache->cache_slots = chapters_in_cache * geometry->record_pages_per_chapter;
-       cache->zone_count = zone_count;
-       atomic64_set(&cache->clock, 1);
-
-       result = ASSERT((cache->cache_slots <= VOLUME_CACHE_MAX_ENTRIES),
-                       "requested cache size, %u, within limit %u",
-                       cache->cache_slots, VOLUME_CACHE_MAX_ENTRIES);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read,
-                             "volume read queue", &cache->read_queue);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(cache->zone_count, struct search_pending_counter,
-                             "Volume Cache Zones", &cache->search_pending_counters);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(cache->indexable_pages, u16, "page cache index",
-                             &cache->index);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       result = uds_allocate(cache->cache_slots, struct cached_page, "page cache cache",
-                             &cache->cache);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       /* Initialize index values to invalid values. */
-       for (i = 0; i < cache->indexable_pages; i++)
-               cache->index[i] = cache->cache_slots;
-
-       for (i = 0; i < cache->cache_slots; i++)
-               clear_cache_page(cache, &cache->cache[i]);
-
-       return UDS_SUCCESS;
-}
-
-int uds_make_volume(const struct uds_configuration *config, struct index_layout *layout,
-                   struct volume **new_volume)
-{
-       unsigned int i;
-       struct volume *volume = NULL;
-       struct index_geometry *geometry;
-       unsigned int reserved_buffers;
-       int result;
-
-       result = uds_allocate(1, struct volume, "volume", &volume);
-       if (result != UDS_SUCCESS)
-               return result;
-
-       volume->nonce = uds_get_volume_nonce(layout);
-
-       result = uds_copy_index_geometry(config->geometry, &volume->geometry);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume(volume);
-               return uds_log_warning_strerror(result,
-                                               "failed to allocate geometry");
-       }
-       geometry = volume->geometry;
-
-       /*
-        * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one
-        * for each entry in the sparse cache.
-        */
-       reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter;
-       reserved_buffers += 1;
-       if (uds_is_sparse_index_geometry(geometry))
-               reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter);
-       volume->reserved_buffers = reserved_buffers;
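	/*
	 * Editorial example, not part of the moved file: with a hypothetical
	 * cache of 3 chapters and 16 record pages per chapter, this reserves
	 * 3 * 16 + 1 = 49 buffers for a dense index; a sparse geometry with
	 * 6 index pages per chapter would add 3 * 6 = 18 more, for 67 total.
	 */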
-       result = uds_open_volume_bufio(layout, geometry->bytes_per_page,
-                                      volume->reserved_buffers, &volume->client);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume(volume);
-               return result;
-       }
-
-       result = uds_make_radix_sorter(geometry->records_per_page,
-                                      &volume->radix_sorter);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume(volume);
-               return result;
-       }
-
-       result = uds_allocate(geometry->records_per_page,
-                             const struct uds_volume_record *, "record pointers",
-                             &volume->record_pointers);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume(volume);
-               return result;
-       }
-
-       if (uds_is_sparse_index_geometry(geometry)) {
-               size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page;
-
-               result = uds_make_sparse_cache(geometry, config->cache_chapters,
-                                              config->zone_count,
-                                              &volume->sparse_cache);
-               if (result != UDS_SUCCESS) {
-                       uds_free_volume(volume);
-                       return result;
-               }
-
-               volume->cache_size =
-                       page_size * geometry->index_pages_per_chapter * config->cache_chapters;
-       }
-
-       result = initialize_page_cache(&volume->page_cache, geometry,
-                                      config->cache_chapters, config->zone_count);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume(volume);
-               return result;
-       }
-
-       volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page);
-       result = uds_make_index_page_map(geometry, &volume->index_page_map);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume(volume);
-               return result;
-       }
-
-       mutex_init(&volume->read_threads_mutex);
-       uds_init_cond(&volume->read_threads_read_done_cond);
-       uds_init_cond(&volume->read_threads_cond);
-
-       result = uds_allocate(config->read_threads, struct thread *, "reader threads",
-                             &volume->reader_threads);
-       if (result != UDS_SUCCESS) {
-               uds_free_volume(volume);
-               return result;
-       }
-
-       for (i = 0; i < config->read_threads; i++) {
-               result = vdo_create_thread(read_thread_function, (void *) volume,
-                                          "reader", &volume->reader_threads[i]);
-               if (result != UDS_SUCCESS) {
-                       uds_free_volume(volume);
-                       return result;
-               }
-
-               volume->read_thread_count = i + 1;
-       }
-
-       *new_volume = volume;
-       return UDS_SUCCESS;
-}
-
-static void uninitialize_page_cache(struct page_cache *cache)
-{
-       u16 i;
-
-       if (cache->cache != NULL) {
-               for (i = 0; i < cache->cache_slots; i++)
-                       release_page_buffer(&cache->cache[i]);
-       }
-       uds_free(cache->index);
-       uds_free(cache->cache);
-       uds_free(cache->search_pending_counters);
-       uds_free(cache->read_queue);
-}
-
-void uds_free_volume(struct volume *volume)
-{
-       if (volume == NULL)
-               return;
-
-       if (volume->reader_threads != NULL) {
-               unsigned int i;
-
-               /* This works even if some threads weren't started. */
-               mutex_lock(&volume->read_threads_mutex);
-               volume->read_threads_exiting = true;
-               uds_broadcast_cond(&volume->read_threads_cond);
-               mutex_unlock(&volume->read_threads_mutex);
-               for (i = 0; i < volume->read_thread_count; i++)
-                       vdo_join_threads(volume->reader_threads[i]);
-               uds_free(volume->reader_threads);
-               volume->reader_threads = NULL;
-       }
-
-       /* Must destroy the client AFTER freeing the cached pages. */
-       uninitialize_page_cache(&volume->page_cache);
-       uds_free_sparse_cache(volume->sparse_cache);
-       if (volume->client != NULL)
-               dm_bufio_client_destroy(uds_forget(volume->client));
-
-       uds_free_index_page_map(volume->index_page_map);
-       uds_free_radix_sorter(volume->radix_sorter);
-       uds_free(volume->geometry);
-       uds_free(volume->record_pointers);
-       uds_free(volume);
-}
diff --git a/drivers/md/dm-vdo/volume.h b/drivers/md/dm-vdo/volume.h
deleted file mode 100644 (file)
index 290de5c..0000000
+++ /dev/null
@@ -1,171 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2023 Red Hat
- */
-
-#ifndef UDS_VOLUME_H
-#define UDS_VOLUME_H
-
-#include <linux/atomic.h>
-#include <linux/cache.h>
-#include <linux/dm-bufio.h>
-#include <linux/limits.h>
-
-#include "chapter-index.h"
-#include "config.h"
-#include "geometry.h"
-#include "indexer.h"
-#include "index-layout.h"
-#include "index-page-map.h"
-#include "permassert.h"
-#include "radix-sort.h"
-#include "sparse-cache.h"
-#include "thread-utils.h"
-
-/*
- * The volume manages deduplication records on permanent storage. The term "volume" can also refer
- * to the region of permanent storage where the records (and the chapters containing them) are
- * stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages
- * as necessary.
- */
-
-enum index_lookup_mode {
-       /* Always do lookups in all chapters normally */
-       LOOKUP_NORMAL,
-       /* Only do a subset of lookups needed when rebuilding an index */
-       LOOKUP_FOR_REBUILD,
-};
-
-struct queued_read {
-       bool invalid;
-       bool reserved;
-       u32 physical_page;
-       struct uds_request *first_request;
-       struct uds_request *last_request;
-};
-
-struct __aligned(L1_CACHE_BYTES) search_pending_counter {
-       u64 atomic_value;
-};
-
-struct cached_page {
-       /* Whether this page is currently being read asynchronously */
-       bool read_pending;
-       /* The physical page stored in this cache entry */
-       u32 physical_page;
-       /* The value of the volume clock when this page was last used */
-       s64 last_used;
-       /* The cached page buffer */
-       struct dm_buffer *buffer;
-       /* The chapter index page, meaningless for record pages */
-       struct delta_index_page index_page;
-};
-
-struct page_cache {
-       /* The number of zones */
-       unsigned int zone_count;
-       /* The number of volume pages that can be cached */
-       u32 indexable_pages;
-       /* The maximum number of simultaneously cached pages */
-       u16 cache_slots;
-       /* An index for each physical page noting where it is in the cache */
-       u16 *index;
-       /* The array of cached pages */
-       struct cached_page *cache;
-       /* A counter for each zone tracking if a search is occurring there */
-       struct search_pending_counter *search_pending_counters;
-       /* The read queue entries as a circular array */
-       struct queued_read *read_queue;
-
-       /* All entries above this point are constant after initialization. */
-
-       /*
-        * These values are all indexes into the array of read queue entries. New entries in the
-        * read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the
-        * lock and then claims the entry pointed to by read_queue_next_read and increments that
-        * value. After the read is completed, the reader thread calls release_read_queue_entry(),
-        * which increments read_queue_first until it points to a pending read, or is equal to
-        * read_queue_next_read. This means that if multiple reads are outstanding,
-        * read_queue_first might not advance until the last of the reads finishes.
-        */
-       u16 read_queue_first;
-       u16 read_queue_next_read;
-       u16 read_queue_last;
-
-       atomic64_t clock;
-};
-
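As a rough illustration of the index protocol described in the struct comment above (an editorial sketch, not from the original header; it omits the wrap-around of the circular array and treats "reserved" as meaning the read is still pending):

	static void advance_read_queue_first(struct page_cache *cache)
	{
		/*
		 * After a completed read is released, skip past entries that
		 * are no longer reserved so that read_queue_first once again
		 * points at a pending read or catches up to
		 * read_queue_next_read.
		 */
		while ((cache->read_queue_first != cache->read_queue_next_read) &&
		       !cache->read_queue[cache->read_queue_first].reserved)
			cache->read_queue_first++;
	}
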
-struct volume {
-       struct index_geometry *geometry;
-       struct dm_bufio_client *client;
-       u64 nonce;
-       size_t cache_size;
-
-       /* A single page worth of records, for sorting */
-       const struct uds_volume_record **record_pointers;
-       /* Sorter for sorting records within each page */
-       struct radix_sorter *radix_sorter;
-
-       struct sparse_cache *sparse_cache;
-       struct page_cache page_cache;
-       struct index_page_map *index_page_map;
-
-       struct mutex read_threads_mutex;
-       struct cond_var read_threads_cond;
-       struct cond_var read_threads_read_done_cond;
-       struct thread **reader_threads;
-       unsigned int read_thread_count;
-       bool read_threads_exiting;
-
-       enum index_lookup_mode lookup_mode;
-       unsigned int reserved_buffers;
-};
-
-int __must_check uds_make_volume(const struct uds_configuration *config,
-                                struct index_layout *layout,
-                                struct volume **new_volume);
-
-void uds_free_volume(struct volume *volume);
-
-int __must_check uds_replace_volume_storage(struct volume *volume,
-                                           struct index_layout *layout,
-                                           struct block_device *bdev);
-
-int __must_check uds_find_volume_chapter_boundaries(struct volume *volume,
-                                                   u64 *lowest_vcn, u64 *highest_vcn,
-                                                   bool *is_empty);
-
-int __must_check uds_search_volume_page_cache(struct volume *volume,
-                                             struct uds_request *request,
-                                             bool *found);
-
-int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume,
-                                                         const struct uds_record_name *name,
-                                                         u64 virtual_chapter,
-                                                         bool *found);
-
-int __must_check uds_search_cached_record_page(struct volume *volume,
-                                              struct uds_request *request, u32 chapter,
-                                              u16 record_page_number, bool *found);
-
-void uds_forget_chapter(struct volume *volume, u64 chapter);
-
-int __must_check uds_write_chapter(struct volume *volume,
-                                  struct open_chapter_index *chapter_index,
-                                  const struct uds_volume_record records[]);
-
-void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter);
-
-int __must_check uds_read_chapter_index_from_volume(const struct volume *volume,
-                                                   u64 virtual_chapter,
-                                                   struct dm_buffer *volume_buffers[],
-                                                   struct delta_index_page index_pages[]);
-
-int __must_check uds_get_volume_record_page(struct volume *volume, u32 chapter,
-                                           u32 page_number, u8 **data_ptr);
-
-int __must_check uds_get_volume_index_page(struct volume *volume, u32 chapter,
-                                          u32 page_number,
-                                          struct delta_index_page **page_ptr);
-
-#endif /* UDS_VOLUME_H */