From 8b0a8165cdad0f4133837d753649ef4682e42c3b Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 7 Aug 2024 15:11:40 +0200 Subject: Merging upstream version 6.9.7. Signed-off-by: Daniel Baumann --- drivers/md/dm-vdo/indexer/chapter-index.c | 293 ++++ drivers/md/dm-vdo/indexer/chapter-index.h | 61 + drivers/md/dm-vdo/indexer/config.c | 376 +++++ drivers/md/dm-vdo/indexer/config.h | 124 ++ drivers/md/dm-vdo/indexer/delta-index.c | 1970 +++++++++++++++++++++++ drivers/md/dm-vdo/indexer/delta-index.h | 279 ++++ drivers/md/dm-vdo/indexer/funnel-requestqueue.c | 279 ++++ drivers/md/dm-vdo/indexer/funnel-requestqueue.h | 31 + drivers/md/dm-vdo/indexer/geometry.c | 201 +++ drivers/md/dm-vdo/indexer/geometry.h | 140 ++ drivers/md/dm-vdo/indexer/hash-utils.h | 66 + drivers/md/dm-vdo/indexer/index-layout.c | 1765 ++++++++++++++++++++ drivers/md/dm-vdo/indexer/index-layout.h | 43 + drivers/md/dm-vdo/indexer/index-page-map.c | 173 ++ drivers/md/dm-vdo/indexer/index-page-map.h | 50 + drivers/md/dm-vdo/indexer/index-session.c | 739 +++++++++ drivers/md/dm-vdo/indexer/index-session.h | 85 + drivers/md/dm-vdo/indexer/index.c | 1388 ++++++++++++++++ drivers/md/dm-vdo/indexer/index.h | 83 + drivers/md/dm-vdo/indexer/indexer.h | 353 ++++ drivers/md/dm-vdo/indexer/io-factory.c | 415 +++++ drivers/md/dm-vdo/indexer/io-factory.h | 64 + drivers/md/dm-vdo/indexer/open-chapter.c | 426 +++++ drivers/md/dm-vdo/indexer/open-chapter.h | 79 + drivers/md/dm-vdo/indexer/radix-sort.c | 330 ++++ drivers/md/dm-vdo/indexer/radix-sort.h | 26 + drivers/md/dm-vdo/indexer/sparse-cache.c | 624 +++++++ drivers/md/dm-vdo/indexer/sparse-cache.h | 46 + drivers/md/dm-vdo/indexer/volume-index.c | 1283 +++++++++++++++ drivers/md/dm-vdo/indexer/volume-index.h | 193 +++ drivers/md/dm-vdo/indexer/volume.c | 1693 +++++++++++++++++++ drivers/md/dm-vdo/indexer/volume.h | 172 ++ 32 files changed, 13850 insertions(+) create mode 100644 drivers/md/dm-vdo/indexer/chapter-index.c create mode 100644 drivers/md/dm-vdo/indexer/chapter-index.h create mode 100644 drivers/md/dm-vdo/indexer/config.c create mode 100644 drivers/md/dm-vdo/indexer/config.h create mode 100644 drivers/md/dm-vdo/indexer/delta-index.c create mode 100644 drivers/md/dm-vdo/indexer/delta-index.h create mode 100644 drivers/md/dm-vdo/indexer/funnel-requestqueue.c create mode 100644 drivers/md/dm-vdo/indexer/funnel-requestqueue.h create mode 100644 drivers/md/dm-vdo/indexer/geometry.c create mode 100644 drivers/md/dm-vdo/indexer/geometry.h create mode 100644 drivers/md/dm-vdo/indexer/hash-utils.h create mode 100644 drivers/md/dm-vdo/indexer/index-layout.c create mode 100644 drivers/md/dm-vdo/indexer/index-layout.h create mode 100644 drivers/md/dm-vdo/indexer/index-page-map.c create mode 100644 drivers/md/dm-vdo/indexer/index-page-map.h create mode 100644 drivers/md/dm-vdo/indexer/index-session.c create mode 100644 drivers/md/dm-vdo/indexer/index-session.h create mode 100644 drivers/md/dm-vdo/indexer/index.c create mode 100644 drivers/md/dm-vdo/indexer/index.h create mode 100644 drivers/md/dm-vdo/indexer/indexer.h create mode 100644 drivers/md/dm-vdo/indexer/io-factory.c create mode 100644 drivers/md/dm-vdo/indexer/io-factory.h create mode 100644 drivers/md/dm-vdo/indexer/open-chapter.c create mode 100644 drivers/md/dm-vdo/indexer/open-chapter.h create mode 100644 drivers/md/dm-vdo/indexer/radix-sort.c create mode 100644 drivers/md/dm-vdo/indexer/radix-sort.h create mode 100644 drivers/md/dm-vdo/indexer/sparse-cache.c create mode 100644 drivers/md/dm-vdo/indexer/sparse-cache.h create mode 100644 drivers/md/dm-vdo/indexer/volume-index.c create mode 100644 drivers/md/dm-vdo/indexer/volume-index.h create mode 100644 drivers/md/dm-vdo/indexer/volume.c create mode 100644 drivers/md/dm-vdo/indexer/volume.h (limited to 'drivers/md/dm-vdo/indexer') diff --git a/drivers/md/dm-vdo/indexer/chapter-index.c b/drivers/md/dm-vdo/indexer/chapter-index.c new file mode 100644 index 0000000000..7e32a25d3f --- /dev/null +++ b/drivers/md/dm-vdo/indexer/chapter-index.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "chapter-index.h" + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "hash-utils.h" +#include "indexer.h" + +int uds_make_open_chapter_index(struct open_chapter_index **chapter_index, + const struct index_geometry *geometry, u64 volume_nonce) +{ + int result; + size_t memory_size; + struct open_chapter_index *index; + + result = vdo_allocate(1, struct open_chapter_index, "open chapter index", &index); + if (result != VDO_SUCCESS) + return result; + + /* + * The delta index will rebalance delta lists when memory gets tight, + * so give the chapter index one extra page. + */ + memory_size = ((geometry->index_pages_per_chapter + 1) * geometry->bytes_per_page); + index->geometry = geometry; + index->volume_nonce = volume_nonce; + result = uds_initialize_delta_index(&index->delta_index, 1, + geometry->delta_lists_per_chapter, + geometry->chapter_mean_delta, + geometry->chapter_payload_bits, + memory_size, 'm'); + if (result != UDS_SUCCESS) { + vdo_free(index); + return result; + } + + index->memory_size = index->delta_index.memory_size + sizeof(struct open_chapter_index); + *chapter_index = index; + return UDS_SUCCESS; +} + +void uds_free_open_chapter_index(struct open_chapter_index *chapter_index) +{ + if (chapter_index == NULL) + return; + + uds_uninitialize_delta_index(&chapter_index->delta_index); + vdo_free(chapter_index); +} + +/* Re-initialize an open chapter index for a new chapter. */ +void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index, + u64 virtual_chapter_number) +{ + uds_reset_delta_index(&chapter_index->delta_index); + chapter_index->virtual_chapter_number = virtual_chapter_number; +} + +static inline bool was_entry_found(const struct delta_index_entry *entry, u32 address) +{ + return (!entry->at_end) && (entry->key == address); +} + +/* Associate a record name with the record page containing its metadata. */ +int uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index, + const struct uds_record_name *name, + u32 page_number) +{ + int result; + struct delta_index_entry entry; + u32 address; + u32 list_number; + const u8 *found_name; + bool found; + const struct index_geometry *geometry = chapter_index->geometry; + u64 chapter_number = chapter_index->virtual_chapter_number; + u32 record_pages = geometry->record_pages_per_chapter; + + result = VDO_ASSERT(page_number < record_pages, + "Page number within chapter (%u) exceeds the maximum value %u", + page_number, record_pages); + if (result != VDO_SUCCESS) + return UDS_INVALID_ARGUMENT; + + address = uds_hash_to_chapter_delta_address(name, geometry); + list_number = uds_hash_to_chapter_delta_list(name, geometry); + result = uds_get_delta_index_entry(&chapter_index->delta_index, list_number, + address, name->name, &entry); + if (result != UDS_SUCCESS) + return result; + + found = was_entry_found(&entry, address); + result = VDO_ASSERT(!(found && entry.is_collision), + "Chunk appears more than once in chapter %llu", + (unsigned long long) chapter_number); + if (result != VDO_SUCCESS) + return UDS_BAD_STATE; + + found_name = (found ? name->name : NULL); + return uds_put_delta_index_entry(&entry, address, page_number, found_name); +} + +/* + * Pack a section of an open chapter index into a chapter index page. A range of delta lists + * (starting with a specified list index) is copied from the open chapter index into a memory page. + * The number of lists copied onto the page is returned to the caller on success. + * + * @chapter_index: The open chapter index + * @memory: The memory page to use + * @first_list: The first delta list number to be copied + * @last_page: If true, this is the last page of the chapter index and all the remaining lists must + * be packed onto this page + * @lists_packed: The number of delta lists that were packed onto this page + */ +int uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, + u8 *memory, u32 first_list, bool last_page, + u32 *lists_packed) +{ + int result; + struct delta_index *delta_index = &chapter_index->delta_index; + struct delta_index_stats stats; + u64 nonce = chapter_index->volume_nonce; + u64 chapter_number = chapter_index->virtual_chapter_number; + const struct index_geometry *geometry = chapter_index->geometry; + u32 list_count = geometry->delta_lists_per_chapter; + unsigned int removals = 0; + struct delta_index_entry entry; + u32 next_list; + s32 list_number; + + for (;;) { + result = uds_pack_delta_index_page(delta_index, nonce, memory, + geometry->bytes_per_page, + chapter_number, first_list, + lists_packed); + if (result != UDS_SUCCESS) + return result; + + if ((first_list + *lists_packed) == list_count) { + /* All lists are packed. */ + break; + } else if (*lists_packed == 0) { + /* + * The next delta list does not fit on a page. This delta list will be + * removed. + */ + } else if (last_page) { + /* + * This is the last page and there are lists left unpacked, but all of the + * remaining lists must fit on the page. Find a list that contains entries + * and remove the entire list. Try the first list that does not fit. If it + * is empty, we will select the last list that already fits and has any + * entries. + */ + } else { + /* This page is done. */ + break; + } + + if (removals == 0) { + uds_get_delta_index_stats(delta_index, &stats); + vdo_log_warning("The chapter index for chapter %llu contains %llu entries with %llu collisions", + (unsigned long long) chapter_number, + (unsigned long long) stats.record_count, + (unsigned long long) stats.collision_count); + } + + list_number = *lists_packed; + do { + if (list_number < 0) + return UDS_OVERFLOW; + + next_list = first_list + list_number--, + result = uds_start_delta_index_search(delta_index, next_list, 0, + &entry); + if (result != UDS_SUCCESS) + return result; + + result = uds_next_delta_index_entry(&entry); + if (result != UDS_SUCCESS) + return result; + } while (entry.at_end); + + do { + result = uds_remove_delta_index_entry(&entry); + if (result != UDS_SUCCESS) + return result; + + removals++; + } while (!entry.at_end); + } + + if (removals > 0) { + vdo_log_warning("To avoid chapter index page overflow in chapter %llu, %u entries were removed from the chapter index", + (unsigned long long) chapter_number, removals); + } + + return UDS_SUCCESS; +} + +/* Make a new chapter index page, initializing it with the data from a given index_page buffer. */ +int uds_initialize_chapter_index_page(struct delta_index_page *index_page, + const struct index_geometry *geometry, + u8 *page_buffer, u64 volume_nonce) +{ + return uds_initialize_delta_index_page(index_page, volume_nonce, + geometry->chapter_mean_delta, + geometry->chapter_payload_bits, + page_buffer, geometry->bytes_per_page); +} + +/* Validate a chapter index page read during rebuild. */ +int uds_validate_chapter_index_page(const struct delta_index_page *index_page, + const struct index_geometry *geometry) +{ + int result; + const struct delta_index *delta_index = &index_page->delta_index; + u32 first = index_page->lowest_list_number; + u32 last = index_page->highest_list_number; + u32 list_number; + + /* We walk every delta list from start to finish. */ + for (list_number = first; list_number <= last; list_number++) { + struct delta_index_entry entry; + + result = uds_start_delta_index_search(delta_index, list_number - first, + 0, &entry); + if (result != UDS_SUCCESS) + return result; + + for (;;) { + result = uds_next_delta_index_entry(&entry); + if (result != UDS_SUCCESS) { + /* + * A random bit stream is highly likely to arrive here when we go + * past the end of the delta list. + */ + return result; + } + + if (entry.at_end) + break; + + /* Also make sure that the record page field contains a plausible value. */ + if (uds_get_delta_entry_value(&entry) >= + geometry->record_pages_per_chapter) { + /* + * Do not log this as an error. It happens in normal operation when + * we are doing a rebuild but haven't written the entire volume + * once. + */ + return UDS_CORRUPT_DATA; + } + } + } + return UDS_SUCCESS; +} + +/* + * Search a chapter index page for a record name, returning the record page number that may contain + * the name. + */ +int uds_search_chapter_index_page(struct delta_index_page *index_page, + const struct index_geometry *geometry, + const struct uds_record_name *name, + u16 *record_page_ptr) +{ + int result; + struct delta_index *delta_index = &index_page->delta_index; + u32 address = uds_hash_to_chapter_delta_address(name, geometry); + u32 delta_list_number = uds_hash_to_chapter_delta_list(name, geometry); + u32 sub_list_number = delta_list_number - index_page->lowest_list_number; + struct delta_index_entry entry; + + result = uds_get_delta_index_entry(delta_index, sub_list_number, address, + name->name, &entry); + if (result != UDS_SUCCESS) + return result; + + if (was_entry_found(&entry, address)) + *record_page_ptr = uds_get_delta_entry_value(&entry); + else + *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; + + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/chapter-index.h b/drivers/md/dm-vdo/indexer/chapter-index.h new file mode 100644 index 0000000000..be8bf2b675 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/chapter-index.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_CHAPTER_INDEX_H +#define UDS_CHAPTER_INDEX_H + +#include + +#include "delta-index.h" +#include "geometry.h" + +/* + * A chapter index for an open chapter is a mutable structure that tracks all the records that have + * been added to the chapter. A chapter index for a closed chapter is similar except that it is + * immutable because the contents of a closed chapter can never change, and the immutable structure + * is more efficient. Both types of chapter index are implemented with a delta index. + */ + +/* The value returned when no entry is found in the chapter index. */ +#define NO_CHAPTER_INDEX_ENTRY U16_MAX + +struct open_chapter_index { + const struct index_geometry *geometry; + struct delta_index delta_index; + u64 virtual_chapter_number; + u64 volume_nonce; + size_t memory_size; +}; + +int __must_check uds_make_open_chapter_index(struct open_chapter_index **chapter_index, + const struct index_geometry *geometry, + u64 volume_nonce); + +void uds_free_open_chapter_index(struct open_chapter_index *chapter_index); + +void uds_empty_open_chapter_index(struct open_chapter_index *chapter_index, + u64 virtual_chapter_number); + +int __must_check uds_put_open_chapter_index_record(struct open_chapter_index *chapter_index, + const struct uds_record_name *name, + u32 page_number); + +int __must_check uds_pack_open_chapter_index_page(struct open_chapter_index *chapter_index, + u8 *memory, u32 first_list, + bool last_page, u32 *lists_packed); + +int __must_check uds_initialize_chapter_index_page(struct delta_index_page *index_page, + const struct index_geometry *geometry, + u8 *page_buffer, u64 volume_nonce); + +int __must_check uds_validate_chapter_index_page(const struct delta_index_page *index_page, + const struct index_geometry *geometry); + +int __must_check uds_search_chapter_index_page(struct delta_index_page *index_page, + const struct index_geometry *geometry, + const struct uds_record_name *name, + u16 *record_page_ptr); + +#endif /* UDS_CHAPTER_INDEX_H */ diff --git a/drivers/md/dm-vdo/indexer/config.c b/drivers/md/dm-vdo/indexer/config.c new file mode 100644 index 0000000000..5532371b95 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/config.c @@ -0,0 +1,376 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "config.h" + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "string-utils.h" +#include "thread-utils.h" + +static const u8 INDEX_CONFIG_MAGIC[] = "ALBIC"; +static const u8 INDEX_CONFIG_VERSION_6_02[] = "06.02"; +static const u8 INDEX_CONFIG_VERSION_8_02[] = "08.02"; + +#define DEFAULT_VOLUME_READ_THREADS 2 +#define MAX_VOLUME_READ_THREADS 16 +#define INDEX_CONFIG_MAGIC_LENGTH (sizeof(INDEX_CONFIG_MAGIC) - 1) +#define INDEX_CONFIG_VERSION_LENGTH ((int)(sizeof(INDEX_CONFIG_VERSION_6_02) - 1)) + +static bool is_version(const u8 *version, u8 *buffer) +{ + return memcmp(version, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0; +} + +static bool are_matching_configurations(struct uds_configuration *saved_config, + struct index_geometry *saved_geometry, + struct uds_configuration *user) +{ + struct index_geometry *geometry = user->geometry; + bool result = true; + + if (saved_geometry->record_pages_per_chapter != geometry->record_pages_per_chapter) { + vdo_log_error("Record pages per chapter (%u) does not match (%u)", + saved_geometry->record_pages_per_chapter, + geometry->record_pages_per_chapter); + result = false; + } + + if (saved_geometry->chapters_per_volume != geometry->chapters_per_volume) { + vdo_log_error("Chapter count (%u) does not match (%u)", + saved_geometry->chapters_per_volume, + geometry->chapters_per_volume); + result = false; + } + + if (saved_geometry->sparse_chapters_per_volume != geometry->sparse_chapters_per_volume) { + vdo_log_error("Sparse chapter count (%u) does not match (%u)", + saved_geometry->sparse_chapters_per_volume, + geometry->sparse_chapters_per_volume); + result = false; + } + + if (saved_config->cache_chapters != user->cache_chapters) { + vdo_log_error("Cache size (%u) does not match (%u)", + saved_config->cache_chapters, user->cache_chapters); + result = false; + } + + if (saved_config->volume_index_mean_delta != user->volume_index_mean_delta) { + vdo_log_error("Volume index mean delta (%u) does not match (%u)", + saved_config->volume_index_mean_delta, + user->volume_index_mean_delta); + result = false; + } + + if (saved_geometry->bytes_per_page != geometry->bytes_per_page) { + vdo_log_error("Bytes per page value (%zu) does not match (%zu)", + saved_geometry->bytes_per_page, geometry->bytes_per_page); + result = false; + } + + if (saved_config->sparse_sample_rate != user->sparse_sample_rate) { + vdo_log_error("Sparse sample rate (%u) does not match (%u)", + saved_config->sparse_sample_rate, + user->sparse_sample_rate); + result = false; + } + + if (saved_config->nonce != user->nonce) { + vdo_log_error("Nonce (%llu) does not match (%llu)", + (unsigned long long) saved_config->nonce, + (unsigned long long) user->nonce); + result = false; + } + + return result; +} + +/* Read the configuration and validate it against the provided one. */ +int uds_validate_config_contents(struct buffered_reader *reader, + struct uds_configuration *user_config) +{ + int result; + struct uds_configuration config; + struct index_geometry geometry; + u8 version_buffer[INDEX_CONFIG_VERSION_LENGTH]; + u32 bytes_per_page; + u8 buffer[sizeof(struct uds_configuration_6_02)]; + size_t offset = 0; + + result = uds_verify_buffered_data(reader, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, version_buffer, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "cannot read index config version"); + + if (!is_version(INDEX_CONFIG_VERSION_6_02, version_buffer) && + !is_version(INDEX_CONFIG_VERSION_8_02, version_buffer)) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "unsupported configuration version: '%.*s'", + INDEX_CONFIG_VERSION_LENGTH, + version_buffer); + } + + result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "cannot read config data"); + + decode_u32_le(buffer, &offset, &geometry.record_pages_per_chapter); + decode_u32_le(buffer, &offset, &geometry.chapters_per_volume); + decode_u32_le(buffer, &offset, &geometry.sparse_chapters_per_volume); + decode_u32_le(buffer, &offset, &config.cache_chapters); + offset += sizeof(u32); + decode_u32_le(buffer, &offset, &config.volume_index_mean_delta); + decode_u32_le(buffer, &offset, &bytes_per_page); + geometry.bytes_per_page = bytes_per_page; + decode_u32_le(buffer, &offset, &config.sparse_sample_rate); + decode_u64_le(buffer, &offset, &config.nonce); + + result = VDO_ASSERT(offset == sizeof(struct uds_configuration_6_02), + "%zu bytes read but not decoded", + sizeof(struct uds_configuration_6_02) - offset); + if (result != VDO_SUCCESS) + return UDS_CORRUPT_DATA; + + if (is_version(INDEX_CONFIG_VERSION_6_02, version_buffer)) { + user_config->geometry->remapped_virtual = 0; + user_config->geometry->remapped_physical = 0; + } else { + u8 remapping[sizeof(u64) + sizeof(u64)]; + + result = uds_read_from_buffered_reader(reader, remapping, + sizeof(remapping)); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "cannot read converted config"); + + offset = 0; + decode_u64_le(remapping, &offset, + &user_config->geometry->remapped_virtual); + decode_u64_le(remapping, &offset, + &user_config->geometry->remapped_physical); + } + + if (!are_matching_configurations(&config, &geometry, user_config)) { + vdo_log_warning("Supplied configuration does not match save"); + return UDS_NO_INDEX; + } + + return UDS_SUCCESS; +} + +/* + * Write the configuration to stable storage. If the superblock version is < 4, write the 6.02 + * version; otherwise write the 8.02 version, indicating the configuration is for an index that has + * been reduced by one chapter. + */ +int uds_write_config_contents(struct buffered_writer *writer, + struct uds_configuration *config, u32 version) +{ + int result; + struct index_geometry *geometry = config->geometry; + u8 buffer[sizeof(struct uds_configuration_8_02)]; + size_t offset = 0; + + result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + /* + * If version is < 4, the index has not been reduced by a chapter so it must be written out + * as version 6.02 so that it is still compatible with older versions of UDS. + */ + if (version >= 4) { + result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_8_02, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return result; + } else { + result = uds_write_to_buffered_writer(writer, INDEX_CONFIG_VERSION_6_02, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return result; + } + + encode_u32_le(buffer, &offset, geometry->record_pages_per_chapter); + encode_u32_le(buffer, &offset, geometry->chapters_per_volume); + encode_u32_le(buffer, &offset, geometry->sparse_chapters_per_volume); + encode_u32_le(buffer, &offset, config->cache_chapters); + encode_u32_le(buffer, &offset, 0); + encode_u32_le(buffer, &offset, config->volume_index_mean_delta); + encode_u32_le(buffer, &offset, geometry->bytes_per_page); + encode_u32_le(buffer, &offset, config->sparse_sample_rate); + encode_u64_le(buffer, &offset, config->nonce); + + result = VDO_ASSERT(offset == sizeof(struct uds_configuration_6_02), + "%zu bytes encoded, of %zu expected", offset, + sizeof(struct uds_configuration_6_02)); + if (result != VDO_SUCCESS) + return result; + + if (version >= 4) { + encode_u64_le(buffer, &offset, geometry->remapped_virtual); + encode_u64_le(buffer, &offset, geometry->remapped_physical); + } + + return uds_write_to_buffered_writer(writer, buffer, offset); +} + +/* Compute configuration parameters that depend on memory size. */ +static int compute_memory_sizes(uds_memory_config_size_t mem_gb, bool sparse, + u32 *chapters_per_volume, u32 *record_pages_per_chapter, + u32 *sparse_chapters_per_volume) +{ + u32 reduced_chapters = 0; + u32 base_chapters; + + if (mem_gb == UDS_MEMORY_CONFIG_256MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_512MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_768MB) { + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if ((mem_gb >= 1) && (mem_gb <= UDS_MEMORY_CONFIG_MAX)) { + base_chapters = mem_gb * DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_256MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_512MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (mem_gb == UDS_MEMORY_CONFIG_REDUCED_768MB) { + reduced_chapters = 1; + base_chapters = DEFAULT_CHAPTERS_PER_VOLUME; + *record_pages_per_chapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if ((mem_gb >= 1 + UDS_MEMORY_CONFIG_REDUCED) && + (mem_gb <= UDS_MEMORY_CONFIG_REDUCED_MAX)) { + reduced_chapters = 1; + base_chapters = ((mem_gb - UDS_MEMORY_CONFIG_REDUCED) * + DEFAULT_CHAPTERS_PER_VOLUME); + *record_pages_per_chapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else { + vdo_log_error("received invalid memory size"); + return -EINVAL; + } + + if (sparse) { + /* Make 95% of chapters sparse, allowing 10x more records. */ + *sparse_chapters_per_volume = (19 * base_chapters) / 2; + base_chapters *= 10; + } else { + *sparse_chapters_per_volume = 0; + } + + *chapters_per_volume = base_chapters - reduced_chapters; + return UDS_SUCCESS; +} + +static unsigned int __must_check normalize_zone_count(unsigned int requested) +{ + unsigned int zone_count = requested; + + if (zone_count == 0) + zone_count = num_online_cpus() / 2; + + if (zone_count < 1) + zone_count = 1; + + if (zone_count > MAX_ZONES) + zone_count = MAX_ZONES; + + vdo_log_info("Using %u indexing zone%s for concurrency.", + zone_count, zone_count == 1 ? "" : "s"); + return zone_count; +} + +static unsigned int __must_check normalize_read_threads(unsigned int requested) +{ + unsigned int read_threads = requested; + + if (read_threads < 1) + read_threads = DEFAULT_VOLUME_READ_THREADS; + + if (read_threads > MAX_VOLUME_READ_THREADS) + read_threads = MAX_VOLUME_READ_THREADS; + + return read_threads; +} + +int uds_make_configuration(const struct uds_parameters *params, + struct uds_configuration **config_ptr) +{ + struct uds_configuration *config; + u32 chapters_per_volume = 0; + u32 record_pages_per_chapter = 0; + u32 sparse_chapters_per_volume = 0; + int result; + + result = compute_memory_sizes(params->memory_size, params->sparse, + &chapters_per_volume, &record_pages_per_chapter, + &sparse_chapters_per_volume); + if (result != UDS_SUCCESS) + return result; + + result = vdo_allocate(1, struct uds_configuration, __func__, &config); + if (result != VDO_SUCCESS) + return result; + + result = uds_make_index_geometry(DEFAULT_BYTES_PER_PAGE, record_pages_per_chapter, + chapters_per_volume, sparse_chapters_per_volume, + 0, 0, &config->geometry); + if (result != UDS_SUCCESS) { + uds_free_configuration(config); + return result; + } + + config->zone_count = normalize_zone_count(params->zone_count); + config->read_threads = normalize_read_threads(params->read_threads); + + config->cache_chapters = DEFAULT_CACHE_CHAPTERS; + config->volume_index_mean_delta = DEFAULT_VOLUME_INDEX_MEAN_DELTA; + config->sparse_sample_rate = (params->sparse ? DEFAULT_SPARSE_SAMPLE_RATE : 0); + config->nonce = params->nonce; + config->bdev = params->bdev; + config->offset = params->offset; + config->size = params->size; + + *config_ptr = config; + return UDS_SUCCESS; +} + +void uds_free_configuration(struct uds_configuration *config) +{ + if (config != NULL) { + uds_free_index_geometry(config->geometry); + vdo_free(config); + } +} + +void uds_log_configuration(struct uds_configuration *config) +{ + struct index_geometry *geometry = config->geometry; + + vdo_log_debug("Configuration:"); + vdo_log_debug(" Record pages per chapter: %10u", geometry->record_pages_per_chapter); + vdo_log_debug(" Chapters per volume: %10u", geometry->chapters_per_volume); + vdo_log_debug(" Sparse chapters per volume: %10u", geometry->sparse_chapters_per_volume); + vdo_log_debug(" Cache size (chapters): %10u", config->cache_chapters); + vdo_log_debug(" Volume index mean delta: %10u", config->volume_index_mean_delta); + vdo_log_debug(" Bytes per page: %10zu", geometry->bytes_per_page); + vdo_log_debug(" Sparse sample rate: %10u", config->sparse_sample_rate); + vdo_log_debug(" Nonce: %llu", (unsigned long long) config->nonce); +} diff --git a/drivers/md/dm-vdo/indexer/config.h b/drivers/md/dm-vdo/indexer/config.h new file mode 100644 index 0000000000..08507dc2f7 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/config.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_CONFIG_H +#define UDS_CONFIG_H + +#include "geometry.h" +#include "indexer.h" +#include "io-factory.h" + +/* + * The uds_configuration records a variety of parameters used to configure a new UDS index. Some + * parameters are provided by the client, while others are fixed or derived from user-supplied + * values. It is created when an index is created, and it is recorded in the index metadata. + */ + +enum { + DEFAULT_VOLUME_INDEX_MEAN_DELTA = 4096, + DEFAULT_CACHE_CHAPTERS = 7, + DEFAULT_SPARSE_SAMPLE_RATE = 32, + MAX_ZONES = 16, +}; + +/* A set of configuration parameters for the indexer. */ +struct uds_configuration { + /* Storage device for the index */ + struct block_device *bdev; + + /* The maximum allowable size of the index */ + size_t size; + + /* The offset where the index should start */ + off_t offset; + + /* Parameters for the volume */ + + /* The volume layout */ + struct index_geometry *geometry; + + /* Index owner's nonce */ + u64 nonce; + + /* The number of threads used to process index requests */ + unsigned int zone_count; + + /* The number of threads used to read volume pages */ + unsigned int read_threads; + + /* Size of the page cache and sparse chapter index cache in chapters */ + u32 cache_chapters; + + /* Parameters for the volume index */ + + /* The mean delta for the volume index */ + u32 volume_index_mean_delta; + + /* Sampling rate for sparse indexing */ + u32 sparse_sample_rate; +}; + +/* On-disk structure of data for a version 8.02 index. */ +struct uds_configuration_8_02 { + /* Smaller (16), Small (64) or large (256) indices */ + u32 record_pages_per_chapter; + /* Total number of chapters per volume */ + u32 chapters_per_volume; + /* Number of sparse chapters per volume */ + u32 sparse_chapters_per_volume; + /* Size of the page cache, in chapters */ + u32 cache_chapters; + /* Unused field */ + u32 unused; + /* The volume index mean delta to use */ + u32 volume_index_mean_delta; + /* Size of a page, used for both record pages and index pages */ + u32 bytes_per_page; + /* Sampling rate for sparse indexing */ + u32 sparse_sample_rate; + /* Index owner's nonce */ + u64 nonce; + /* Virtual chapter remapped from physical chapter 0 */ + u64 remapped_virtual; + /* New physical chapter which remapped chapter was moved to */ + u64 remapped_physical; +} __packed; + +/* On-disk structure of data for a version 6.02 index. */ +struct uds_configuration_6_02 { + /* Smaller (16), Small (64) or large (256) indices */ + u32 record_pages_per_chapter; + /* Total number of chapters per volume */ + u32 chapters_per_volume; + /* Number of sparse chapters per volume */ + u32 sparse_chapters_per_volume; + /* Size of the page cache, in chapters */ + u32 cache_chapters; + /* Unused field */ + u32 unused; + /* The volume index mean delta to use */ + u32 volume_index_mean_delta; + /* Size of a page, used for both record pages and index pages */ + u32 bytes_per_page; + /* Sampling rate for sparse indexing */ + u32 sparse_sample_rate; + /* Index owner's nonce */ + u64 nonce; +} __packed; + +int __must_check uds_make_configuration(const struct uds_parameters *params, + struct uds_configuration **config_ptr); + +void uds_free_configuration(struct uds_configuration *config); + +int __must_check uds_validate_config_contents(struct buffered_reader *reader, + struct uds_configuration *config); + +int __must_check uds_write_config_contents(struct buffered_writer *writer, + struct uds_configuration *config, u32 version); + +void uds_log_configuration(struct uds_configuration *config); + +#endif /* UDS_CONFIG_H */ diff --git a/drivers/md/dm-vdo/indexer/delta-index.c b/drivers/md/dm-vdo/indexer/delta-index.c new file mode 100644 index 0000000000..0ac2443f0d --- /dev/null +++ b/drivers/md/dm-vdo/indexer/delta-index.c @@ -0,0 +1,1970 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ +#include "delta-index.h" + +#include +#include +#include +#include +#include + +#include "cpu.h" +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "string-utils.h" +#include "time-utils.h" + +#include "config.h" +#include "indexer.h" + +/* + * The entries in a delta index could be stored in a single delta list, but to reduce search times + * and update costs it uses multiple delta lists. These lists are stored in a single chunk of + * memory managed by the delta_zone structure. The delta_zone can move the data around within its + * memory, so the location of each delta list is recorded as a bit offset into the memory. Because + * the volume index can contain over a million delta lists, we want to be efficient with the size + * of the delta list header information. This information is encoded into 16 bytes per list. The + * volume index delta list memory can easily exceed 4 gigabits, so a 64 bit value is needed to + * address the memory. The volume index delta lists average around 6 kilobits, so 16 bits are + * sufficient to store the size of a delta list. + * + * Each delta list is stored as a bit stream. Within the delta list encoding, bits and bytes are + * numbered in little endian order. Within a byte, bit 0 is the least significant bit (0x1), and + * bit 7 is the most significant bit (0x80). Within a bit stream, bit 7 is the most significant bit + * of byte 0, and bit 8 is the least significant bit of byte 1. Within a byte array, a byte's + * number corresponds to its index in the array. + * + * A standard delta list entry is stored as a fixed length payload (the value) followed by a + * variable length key (the delta). A collision entry is used when two block names have the same + * delta list address. A collision entry always follows a standard entry for the hash with which it + * collides, and is encoded with DELTA == 0 with an additional 256 bits field at the end, + * containing the full block name. An entry with a delta of 0 at the beginning of a delta list + * indicates a normal entry. + * + * The delta in each entry is encoded with a variable-length Huffman code to minimize the memory + * used by small deltas. The Huffman code is specified by three parameters, which can be computed + * from the desired mean delta when the index is full. (See compute_coding_constants() for + * details.) + * + * The bit field utilities used to read and write delta entries assume that it is possible to read + * some bytes beyond the end of the bit field, so a delta_zone memory allocation is guarded by two + * invalid delta lists to prevent reading outside the delta_zone memory. The valid delta lists are + * numbered 1 to N, and the guard lists are numbered 0 and N+1. The function to decode the bit + * stream include a step that skips over bits set to 0 until the first 1 bit is found. A corrupted + * delta list could cause this step to run off the end of the delta_zone memory, so as extra + * protection against this happening, the tail guard list is set to all ones. + * + * The delta_index supports two different forms. The mutable form is created by + * uds_initialize_delta_index(), and is used for the volume index and for open chapter indexes. The + * immutable form is created by uds_initialize_delta_index_page(), and is used for closed (and + * cached) chapter index pages. The immutable form does not allocate delta list headers or + * temporary offsets, and thus is somewhat more memory efficient. + */ + +/* + * This is the largest field size supported by get_field() and set_field(). Any field that is + * larger is not guaranteed to fit in a single byte-aligned u32. + */ +#define MAX_FIELD_BITS ((sizeof(u32) - 1) * BITS_PER_BYTE + 1) + +/* + * This is the largest field size supported by get_big_field() and set_big_field(). Any field that + * is larger is not guaranteed to fit in a single byte-aligned u64. + */ +#define MAX_BIG_FIELD_BITS ((sizeof(u64) - 1) * BITS_PER_BYTE + 1) + +/* + * This is the number of guard bytes needed at the end of the memory byte array when using the bit + * utilities. These utilities call get_big_field() and set_big_field(), which can access up to 7 + * bytes beyond the end of the desired field. The definition is written to make it clear how this + * value is derived. + */ +#define POST_FIELD_GUARD_BYTES (sizeof(u64) - 1) + +/* The number of guard bits that are needed in the tail guard list */ +#define GUARD_BITS (POST_FIELD_GUARD_BYTES * BITS_PER_BYTE) + +/* + * The maximum size of a single delta list in bytes. We count guard bytes in this value because a + * buffer of this size can be used with move_bits(). + */ +#define DELTA_LIST_MAX_BYTE_COUNT \ + ((U16_MAX + BITS_PER_BYTE) / BITS_PER_BYTE + POST_FIELD_GUARD_BYTES) + +/* The number of extra bytes and bits needed to store a collision entry */ +#define COLLISION_BYTES UDS_RECORD_NAME_SIZE +#define COLLISION_BITS (COLLISION_BYTES * BITS_PER_BYTE) + +/* + * Immutable delta lists are packed into pages containing a header that encodes the delta list + * information into 19 bits per list (64KB bit offset). + */ +#define IMMUTABLE_HEADER_SIZE 19 + +/* + * Constants and structures for the saved delta index. "DI" is for delta_index, and -##### is a + * number to increment when the format of the data changes. + */ +#define MAGIC_SIZE 8 + +static const char DELTA_INDEX_MAGIC[] = "DI-00002"; + +struct delta_index_header { + char magic[MAGIC_SIZE]; + u32 zone_number; + u32 zone_count; + u32 first_list; + u32 list_count; + u64 record_count; + u64 collision_count; +}; + +/* + * Header data used for immutable delta index pages. This data is followed by the delta list offset + * table. + */ +struct delta_page_header { + /* Externally-defined nonce */ + u64 nonce; + /* The virtual chapter number */ + u64 virtual_chapter_number; + /* Index of the first delta list on the page */ + u16 first_list; + /* Number of delta lists on the page */ + u16 list_count; +} __packed; + +static inline u64 get_delta_list_byte_start(const struct delta_list *delta_list) +{ + return delta_list->start / BITS_PER_BYTE; +} + +static inline u16 get_delta_list_byte_size(const struct delta_list *delta_list) +{ + unsigned int bit_offset = delta_list->start % BITS_PER_BYTE; + + return BITS_TO_BYTES(bit_offset + delta_list->size); +} + +static void rebalance_delta_zone(const struct delta_zone *delta_zone, u32 first, + u32 last) +{ + struct delta_list *delta_list; + u64 new_start; + + if (first == last) { + /* Only one list is moving, and we know there is space. */ + delta_list = &delta_zone->delta_lists[first]; + new_start = delta_zone->new_offsets[first]; + if (delta_list->start != new_start) { + u64 source; + u64 destination; + + source = get_delta_list_byte_start(delta_list); + delta_list->start = new_start; + destination = get_delta_list_byte_start(delta_list); + memmove(delta_zone->memory + destination, + delta_zone->memory + source, + get_delta_list_byte_size(delta_list)); + } + } else { + /* + * There is more than one list. Divide the problem in half, and use recursive calls + * to process each half. Note that after this computation, first <= middle, and + * middle < last. + */ + u32 middle = (first + last) / 2; + + delta_list = &delta_zone->delta_lists[middle]; + new_start = delta_zone->new_offsets[middle]; + + /* + * The direction that our middle list is moving determines which half of the + * problem must be processed first. + */ + if (new_start > delta_list->start) { + rebalance_delta_zone(delta_zone, middle + 1, last); + rebalance_delta_zone(delta_zone, first, middle); + } else { + rebalance_delta_zone(delta_zone, first, middle); + rebalance_delta_zone(delta_zone, middle + 1, last); + } + } +} + +static inline size_t get_zone_memory_size(unsigned int zone_count, size_t memory_size) +{ + /* Round up so that each zone is a multiple of 64K in size. */ + size_t ALLOC_BOUNDARY = 64 * 1024; + + return (memory_size / zone_count + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; +} + +void uds_reset_delta_index(const struct delta_index *delta_index) +{ + unsigned int z; + + /* + * Initialize all delta lists to be empty. We keep 2 extra delta list descriptors, one + * before the first real entry and one after so that we don't need to bounds check the + * array access when calculating preceding and following gap sizes. + */ + for (z = 0; z < delta_index->zone_count; z++) { + u64 list_bits; + u64 spacing; + u64 offset; + unsigned int i; + struct delta_zone *zone = &delta_index->delta_zones[z]; + struct delta_list *delta_lists = zone->delta_lists; + + /* Zeroing the delta list headers initializes the head guard list correctly. */ + memset(delta_lists, 0, + (zone->list_count + 2) * sizeof(struct delta_list)); + + /* Set all the bits in the end guard list. */ + list_bits = (u64) zone->size * BITS_PER_BYTE - GUARD_BITS; + delta_lists[zone->list_count + 1].start = list_bits; + delta_lists[zone->list_count + 1].size = GUARD_BITS; + memset(zone->memory + (list_bits / BITS_PER_BYTE), ~0, + POST_FIELD_GUARD_BYTES); + + /* Evenly space out the real delta lists by setting regular offsets. */ + spacing = list_bits / zone->list_count; + offset = spacing / 2; + for (i = 1; i <= zone->list_count; i++) { + delta_lists[i].start = offset; + offset += spacing; + } + + /* Update the statistics. */ + zone->discard_count += zone->record_count; + zone->record_count = 0; + zone->collision_count = 0; + } +} + +/* Compute the Huffman coding parameters for the given mean delta. The Huffman code is specified by + * three parameters: + * + * MINBITS The number of bits in the smallest code + * BASE The number of values coded using a code of length MINBITS + * INCR The number of values coded by using one additional bit + * + * These parameters are related by this equation: + * + * BASE + INCR == 1 << MINBITS + * + * The math for the Huffman code of an exponential distribution says that + * + * INCR = log(2) * MEAN_DELTA + * + * Then use the smallest MINBITS value so that + * + * (1 << MINBITS) > INCR + * + * And then + * + * BASE = (1 << MINBITS) - INCR + * + * Now the index can generate a code such that + * - The first BASE values code using MINBITS bits. + * - The next INCR values code using MINBITS+1 bits. + * - The next INCR values code using MINBITS+2 bits. + * - (and so on). + */ +static void compute_coding_constants(u32 mean_delta, u16 *min_bits, u32 *min_keys, u32 *incr_keys) +{ + /* + * We want to compute the rounded value of log(2) * mean_delta. Since we cannot always use + * floating point, use a really good integer approximation. + */ + *incr_keys = (836158UL * mean_delta + 603160UL) / 1206321UL; + *min_bits = bits_per(*incr_keys + 1); + *min_keys = (1 << *min_bits) - *incr_keys; +} + +void uds_uninitialize_delta_index(struct delta_index *delta_index) +{ + unsigned int z; + + if (delta_index->delta_zones == NULL) + return; + + for (z = 0; z < delta_index->zone_count; z++) { + vdo_free(vdo_forget(delta_index->delta_zones[z].new_offsets)); + vdo_free(vdo_forget(delta_index->delta_zones[z].delta_lists)); + vdo_free(vdo_forget(delta_index->delta_zones[z].memory)); + } + + vdo_free(delta_index->delta_zones); + memset(delta_index, 0, sizeof(struct delta_index)); +} + +static int initialize_delta_zone(struct delta_zone *delta_zone, size_t size, + u32 first_list, u32 list_count, u32 mean_delta, + u32 payload_bits, u8 tag) +{ + int result; + + result = vdo_allocate(size, u8, "delta list", &delta_zone->memory); + if (result != VDO_SUCCESS) + return result; + + result = vdo_allocate(list_count + 2, u64, "delta list temp", + &delta_zone->new_offsets); + if (result != VDO_SUCCESS) + return result; + + /* Allocate the delta lists. */ + result = vdo_allocate(list_count + 2, struct delta_list, "delta lists", + &delta_zone->delta_lists); + if (result != VDO_SUCCESS) + return result; + + compute_coding_constants(mean_delta, &delta_zone->min_bits, + &delta_zone->min_keys, &delta_zone->incr_keys); + delta_zone->value_bits = payload_bits; + delta_zone->buffered_writer = NULL; + delta_zone->size = size; + delta_zone->rebalance_time = 0; + delta_zone->rebalance_count = 0; + delta_zone->record_count = 0; + delta_zone->collision_count = 0; + delta_zone->discard_count = 0; + delta_zone->overflow_count = 0; + delta_zone->first_list = first_list; + delta_zone->list_count = list_count; + delta_zone->tag = tag; + + return UDS_SUCCESS; +} + +int uds_initialize_delta_index(struct delta_index *delta_index, unsigned int zone_count, + u32 list_count, u32 mean_delta, u32 payload_bits, + size_t memory_size, u8 tag) +{ + int result; + unsigned int z; + size_t zone_memory; + + result = vdo_allocate(zone_count, struct delta_zone, "Delta Index Zones", + &delta_index->delta_zones); + if (result != VDO_SUCCESS) + return result; + + delta_index->zone_count = zone_count; + delta_index->list_count = list_count; + delta_index->lists_per_zone = DIV_ROUND_UP(list_count, zone_count); + delta_index->memory_size = 0; + delta_index->mutable = true; + delta_index->tag = tag; + + for (z = 0; z < zone_count; z++) { + u32 lists_in_zone = delta_index->lists_per_zone; + u32 first_list_in_zone = z * lists_in_zone; + + if (z == zone_count - 1) { + /* + * The last zone gets fewer lists if zone_count doesn't evenly divide + * list_count. We'll have an underflow if the assertion below doesn't hold. + */ + if (delta_index->list_count <= first_list_in_zone) { + uds_uninitialize_delta_index(delta_index); + return vdo_log_error_strerror(UDS_INVALID_ARGUMENT, + "%u delta lists not enough for %u zones", + list_count, zone_count); + } + lists_in_zone = delta_index->list_count - first_list_in_zone; + } + + zone_memory = get_zone_memory_size(zone_count, memory_size); + result = initialize_delta_zone(&delta_index->delta_zones[z], zone_memory, + first_list_in_zone, lists_in_zone, + mean_delta, payload_bits, tag); + if (result != UDS_SUCCESS) { + uds_uninitialize_delta_index(delta_index); + return result; + } + + delta_index->memory_size += + (sizeof(struct delta_zone) + zone_memory + + (lists_in_zone + 2) * (sizeof(struct delta_list) + sizeof(u64))); + } + + uds_reset_delta_index(delta_index); + return UDS_SUCCESS; +} + +/* Read a bit field from an arbitrary bit boundary. */ +static inline u32 get_field(const u8 *memory, u64 offset, u8 size) +{ + const void *addr = memory + offset / BITS_PER_BYTE; + + return (get_unaligned_le32(addr) >> (offset % BITS_PER_BYTE)) & ((1 << size) - 1); +} + +/* Write a bit field to an arbitrary bit boundary. */ +static inline void set_field(u32 value, u8 *memory, u64 offset, u8 size) +{ + void *addr = memory + offset / BITS_PER_BYTE; + int shift = offset % BITS_PER_BYTE; + u32 data = get_unaligned_le32(addr); + + data &= ~(((1 << size) - 1) << shift); + data |= value << shift; + put_unaligned_le32(data, addr); +} + +/* Get the bit offset to the immutable delta list header. */ +static inline u32 get_immutable_header_offset(u32 list_number) +{ + return sizeof(struct delta_page_header) * BITS_PER_BYTE + + list_number * IMMUTABLE_HEADER_SIZE; +} + +/* Get the bit offset to the start of the immutable delta list bit stream. */ +static inline u32 get_immutable_start(const u8 *memory, u32 list_number) +{ + return get_field(memory, get_immutable_header_offset(list_number), + IMMUTABLE_HEADER_SIZE); +} + +/* Set the bit offset to the start of the immutable delta list bit stream. */ +static inline void set_immutable_start(u8 *memory, u32 list_number, u32 start) +{ + set_field(start, memory, get_immutable_header_offset(list_number), + IMMUTABLE_HEADER_SIZE); +} + +static bool verify_delta_index_page(u64 nonce, u16 list_count, u64 expected_nonce, + u8 *memory, size_t memory_size) +{ + unsigned int i; + + /* + * Verify the nonce. A mismatch can happen here during rebuild if we haven't written the + * entire volume at least once. + */ + if (nonce != expected_nonce) + return false; + + /* Verify that the number of delta lists can fit in the page. */ + if (list_count > ((memory_size - sizeof(struct delta_page_header)) * + BITS_PER_BYTE / IMMUTABLE_HEADER_SIZE)) + return false; + + /* + * Verify that the first delta list is immediately after the last delta + * list header. + */ + if (get_immutable_start(memory, 0) != get_immutable_header_offset(list_count + 1)) + return false; + + /* Verify that the lists are in the correct order. */ + for (i = 0; i < list_count; i++) { + if (get_immutable_start(memory, i) > get_immutable_start(memory, i + 1)) + return false; + } + + /* + * Verify that the last list ends on the page, and that there is room + * for the post-field guard bits. + */ + if (get_immutable_start(memory, list_count) > + (memory_size - POST_FIELD_GUARD_BYTES) * BITS_PER_BYTE) + return false; + + /* Verify that the guard bytes are correctly set to all ones. */ + for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { + if (memory[memory_size - POST_FIELD_GUARD_BYTES + i] != (u8) ~0) + return false; + } + + /* All verifications passed. */ + return true; +} + +/* Initialize a delta index page to refer to a supplied page. */ +int uds_initialize_delta_index_page(struct delta_index_page *delta_index_page, + u64 expected_nonce, u32 mean_delta, u32 payload_bits, + u8 *memory, size_t memory_size) +{ + u64 nonce; + u64 vcn; + u64 first_list; + u64 list_count; + struct delta_page_header *header = (struct delta_page_header *) memory; + struct delta_zone *delta_zone = &delta_index_page->delta_zone; + const u8 *nonce_addr = (const u8 *) &header->nonce; + const u8 *vcn_addr = (const u8 *) &header->virtual_chapter_number; + const u8 *first_list_addr = (const u8 *) &header->first_list; + const u8 *list_count_addr = (const u8 *) &header->list_count; + + /* First assume that the header is little endian. */ + nonce = get_unaligned_le64(nonce_addr); + vcn = get_unaligned_le64(vcn_addr); + first_list = get_unaligned_le16(first_list_addr); + list_count = get_unaligned_le16(list_count_addr); + if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory, + memory_size)) { + /* If that fails, try big endian. */ + nonce = get_unaligned_be64(nonce_addr); + vcn = get_unaligned_be64(vcn_addr); + first_list = get_unaligned_be16(first_list_addr); + list_count = get_unaligned_be16(list_count_addr); + if (!verify_delta_index_page(nonce, list_count, expected_nonce, memory, + memory_size)) { + /* + * Both attempts failed. Do not log this as an error, because it can happen + * during a rebuild if we haven't written the entire volume at least once. + */ + return UDS_CORRUPT_DATA; + } + } + + delta_index_page->delta_index.delta_zones = delta_zone; + delta_index_page->delta_index.zone_count = 1; + delta_index_page->delta_index.list_count = list_count; + delta_index_page->delta_index.lists_per_zone = list_count; + delta_index_page->delta_index.mutable = false; + delta_index_page->delta_index.tag = 'p'; + delta_index_page->virtual_chapter_number = vcn; + delta_index_page->lowest_list_number = first_list; + delta_index_page->highest_list_number = first_list + list_count - 1; + + compute_coding_constants(mean_delta, &delta_zone->min_bits, + &delta_zone->min_keys, &delta_zone->incr_keys); + delta_zone->value_bits = payload_bits; + delta_zone->memory = memory; + delta_zone->delta_lists = NULL; + delta_zone->new_offsets = NULL; + delta_zone->buffered_writer = NULL; + delta_zone->size = memory_size; + delta_zone->rebalance_time = 0; + delta_zone->rebalance_count = 0; + delta_zone->record_count = 0; + delta_zone->collision_count = 0; + delta_zone->discard_count = 0; + delta_zone->overflow_count = 0; + delta_zone->first_list = 0; + delta_zone->list_count = list_count; + delta_zone->tag = 'p'; + + return UDS_SUCCESS; +} + +/* Read a large bit field from an arbitrary bit boundary. */ +static inline u64 get_big_field(const u8 *memory, u64 offset, u8 size) +{ + const void *addr = memory + offset / BITS_PER_BYTE; + + return (get_unaligned_le64(addr) >> (offset % BITS_PER_BYTE)) & ((1UL << size) - 1); +} + +/* Write a large bit field to an arbitrary bit boundary. */ +static inline void set_big_field(u64 value, u8 *memory, u64 offset, u8 size) +{ + void *addr = memory + offset / BITS_PER_BYTE; + u8 shift = offset % BITS_PER_BYTE; + u64 data = get_unaligned_le64(addr); + + data &= ~(((1UL << size) - 1) << shift); + data |= value << shift; + put_unaligned_le64(data, addr); +} + +/* Set a sequence of bits to all zeros. */ +static inline void set_zero(u8 *memory, u64 offset, u32 size) +{ + if (size > 0) { + u8 *addr = memory + offset / BITS_PER_BYTE; + u8 shift = offset % BITS_PER_BYTE; + u32 count = size + shift > BITS_PER_BYTE ? (u32) BITS_PER_BYTE - shift : size; + + *addr++ &= ~(((1 << count) - 1) << shift); + for (size -= count; size > BITS_PER_BYTE; size -= BITS_PER_BYTE) + *addr++ = 0; + + if (size > 0) + *addr &= 0xFF << size; + } +} + +/* + * Move several bits from a higher to a lower address, moving the lower addressed bits first. The + * size and memory offsets are measured in bits. + */ +static void move_bits_down(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) +{ + const u8 *source; + u8 *destination; + u8 offset; + u8 count; + u64 field; + + /* Start by moving one field that ends on a to int boundary. */ + count = (MAX_BIG_FIELD_BITS - ((to_offset + MAX_BIG_FIELD_BITS) % BITS_PER_TYPE(u32))); + field = get_big_field(from, from_offset, count); + set_big_field(field, to, to_offset, count); + from_offset += count; + to_offset += count; + size -= count; + + /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */ + offset = from_offset % BITS_PER_TYPE(u32); + source = from + (from_offset - offset) / BITS_PER_BYTE; + destination = to + to_offset / BITS_PER_BYTE; + while (size > MAX_BIG_FIELD_BITS) { + put_unaligned_le32(get_unaligned_le64(source) >> offset, destination); + source += sizeof(u32); + destination += sizeof(u32); + from_offset += BITS_PER_TYPE(u32); + to_offset += BITS_PER_TYPE(u32); + size -= BITS_PER_TYPE(u32); + } + + /* Finish up by moving any remaining bits. */ + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } +} + +/* + * Move several bits from a lower to a higher address, moving the higher addressed bits first. The + * size and memory offsets are measured in bits. + */ +static void move_bits_up(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) +{ + const u8 *source; + u8 *destination; + u8 offset; + u8 count; + u64 field; + + /* Start by moving one field that begins on a destination int boundary. */ + count = (to_offset + size) % BITS_PER_TYPE(u32); + if (count > 0) { + size -= count; + field = get_big_field(from, from_offset + size, count); + set_big_field(field, to, to_offset + size, count); + } + + /* Now do the main loop to copy 32 bit chunks that are int-aligned at the destination. */ + offset = (from_offset + size) % BITS_PER_TYPE(u32); + source = from + (from_offset + size - offset) / BITS_PER_BYTE; + destination = to + (to_offset + size) / BITS_PER_BYTE; + while (size > MAX_BIG_FIELD_BITS) { + source -= sizeof(u32); + destination -= sizeof(u32); + size -= BITS_PER_TYPE(u32); + put_unaligned_le32(get_unaligned_le64(source) >> offset, destination); + } + + /* Finish up by moving any remaining bits. */ + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } +} + +/* + * Move bits from one field to another. When the fields overlap, behave as if we first move all the + * bits from the source to a temporary value, and then move all the bits from the temporary value + * to the destination. The size and memory offsets are measured in bits. + */ +static void move_bits(const u8 *from, u64 from_offset, u8 *to, u64 to_offset, u32 size) +{ + u64 field; + + /* A small move doesn't require special handling. */ + if (size <= MAX_BIG_FIELD_BITS) { + if (size > 0) { + field = get_big_field(from, from_offset, size); + set_big_field(field, to, to_offset, size); + } + + return; + } + + if (from_offset > to_offset) + move_bits_down(from, from_offset, to, to_offset, size); + else + move_bits_up(from, from_offset, to, to_offset, size); +} + +/* + * Pack delta lists from a mutable delta index into an immutable delta index page. A range of delta + * lists (starting with a specified list index) is copied from the mutable delta index into a + * memory page used in the immutable index. The number of lists copied onto the page is returned in + * list_count. + */ +int uds_pack_delta_index_page(const struct delta_index *delta_index, u64 header_nonce, + u8 *memory, size_t memory_size, u64 virtual_chapter_number, + u32 first_list, u32 *list_count) +{ + const struct delta_zone *delta_zone; + struct delta_list *delta_lists; + u32 max_lists; + u32 n_lists = 0; + u32 offset; + u32 i; + int free_bits; + int bits; + struct delta_page_header *header; + + delta_zone = &delta_index->delta_zones[0]; + delta_lists = &delta_zone->delta_lists[first_list + 1]; + max_lists = delta_index->list_count - first_list; + + /* + * Compute how many lists will fit on the page. Subtract the size of the fixed header, one + * delta list offset, and the guard bytes from the page size to determine how much space is + * available for delta lists. + */ + free_bits = memory_size * BITS_PER_BYTE; + free_bits -= get_immutable_header_offset(1); + free_bits -= GUARD_BITS; + if (free_bits < IMMUTABLE_HEADER_SIZE) { + /* This page is too small to store any delta lists. */ + return vdo_log_error_strerror(UDS_OVERFLOW, + "Chapter Index Page of %zu bytes is too small", + memory_size); + } + + while (n_lists < max_lists) { + /* Each list requires a delta list offset and the list data. */ + bits = IMMUTABLE_HEADER_SIZE + delta_lists[n_lists].size; + if (bits > free_bits) + break; + + n_lists++; + free_bits -= bits; + } + + *list_count = n_lists; + + header = (struct delta_page_header *) memory; + put_unaligned_le64(header_nonce, (u8 *) &header->nonce); + put_unaligned_le64(virtual_chapter_number, + (u8 *) &header->virtual_chapter_number); + put_unaligned_le16(first_list, (u8 *) &header->first_list); + put_unaligned_le16(n_lists, (u8 *) &header->list_count); + + /* Construct the delta list offset table. */ + offset = get_immutable_header_offset(n_lists + 1); + set_immutable_start(memory, 0, offset); + for (i = 0; i < n_lists; i++) { + offset += delta_lists[i].size; + set_immutable_start(memory, i + 1, offset); + } + + /* Copy the delta list data onto the memory page. */ + for (i = 0; i < n_lists; i++) { + move_bits(delta_zone->memory, delta_lists[i].start, memory, + get_immutable_start(memory, i), delta_lists[i].size); + } + + /* Set all the bits in the guard bytes. */ + memset(memory + memory_size - POST_FIELD_GUARD_BYTES, ~0, + POST_FIELD_GUARD_BYTES); + return UDS_SUCCESS; +} + +/* Compute the new offsets of the delta lists. */ +static void compute_new_list_offsets(struct delta_zone *delta_zone, u32 growing_index, + size_t growing_size, size_t used_space) +{ + size_t spacing; + u32 i; + struct delta_list *delta_lists = delta_zone->delta_lists; + u32 tail_guard_index = delta_zone->list_count + 1; + + spacing = (delta_zone->size - used_space) / delta_zone->list_count; + delta_zone->new_offsets[0] = 0; + for (i = 0; i <= delta_zone->list_count; i++) { + delta_zone->new_offsets[i + 1] = + (delta_zone->new_offsets[i] + + get_delta_list_byte_size(&delta_lists[i]) + spacing); + delta_zone->new_offsets[i] *= BITS_PER_BYTE; + delta_zone->new_offsets[i] += delta_lists[i].start % BITS_PER_BYTE; + if (i == 0) + delta_zone->new_offsets[i + 1] -= spacing / 2; + if (i + 1 == growing_index) + delta_zone->new_offsets[i + 1] += growing_size; + } + + delta_zone->new_offsets[tail_guard_index] = + (delta_zone->size * BITS_PER_BYTE - delta_lists[tail_guard_index].size); +} + +static void rebalance_lists(struct delta_zone *delta_zone) +{ + struct delta_list *delta_lists; + u32 i; + size_t used_space = 0; + + /* Extend and balance memory to receive the delta lists */ + delta_lists = delta_zone->delta_lists; + for (i = 0; i <= delta_zone->list_count + 1; i++) + used_space += get_delta_list_byte_size(&delta_lists[i]); + + compute_new_list_offsets(delta_zone, 0, 0, used_space); + for (i = 1; i <= delta_zone->list_count + 1; i++) + delta_lists[i].start = delta_zone->new_offsets[i]; +} + +/* Start restoring a delta index from multiple input streams. */ +int uds_start_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + unsigned int zone_count = reader_count; + u64 record_count = 0; + u64 collision_count = 0; + u32 first_list[MAX_ZONES]; + u32 list_count[MAX_ZONES]; + unsigned int z; + u32 list_next = 0; + const struct delta_zone *delta_zone; + + /* Read and validate each header. */ + for (z = 0; z < zone_count; z++) { + struct delta_index_header header; + u8 buffer[sizeof(struct delta_index_header)]; + size_t offset = 0; + + result = uds_read_from_buffered_reader(buffered_readers[z], buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read delta index header"); + } + + memcpy(&header.magic, buffer, MAGIC_SIZE); + offset += MAGIC_SIZE; + decode_u32_le(buffer, &offset, &header.zone_number); + decode_u32_le(buffer, &offset, &header.zone_count); + decode_u32_le(buffer, &offset, &header.first_list); + decode_u32_le(buffer, &offset, &header.list_count); + decode_u64_le(buffer, &offset, &header.record_count); + decode_u64_le(buffer, &offset, &header.collision_count); + + result = VDO_ASSERT(offset == sizeof(struct delta_index_header), + "%zu bytes decoded of %zu expected", offset, + sizeof(struct delta_index_header)); + if (result != VDO_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read delta index header"); + } + + if (memcmp(header.magic, DELTA_INDEX_MAGIC, MAGIC_SIZE) != 0) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index file has bad magic number"); + } + + if (zone_count != header.zone_count) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain mismatched zone counts (%u,%u)", + zone_count, header.zone_count); + } + + if (header.zone_number != z) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index zone %u found in slot %u", + header.zone_number, z); + } + + first_list[z] = header.first_list; + list_count[z] = header.list_count; + record_count += header.record_count; + collision_count += header.collision_count; + + if (first_list[z] != list_next) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index file for zone %u starts with list %u instead of list %u", + z, first_list[z], list_next); + } + + list_next += list_count[z]; + } + + if (list_next != delta_index->list_count) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain %u delta lists instead of %u delta lists", + list_next, delta_index->list_count); + } + + if (collision_count > record_count) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "delta index files contain %llu collisions and %llu records", + (unsigned long long) collision_count, + (unsigned long long) record_count); + } + + uds_reset_delta_index(delta_index); + delta_index->delta_zones[0].record_count = record_count; + delta_index->delta_zones[0].collision_count = collision_count; + + /* Read the delta lists and distribute them to the proper zones. */ + for (z = 0; z < zone_count; z++) { + u32 i; + + delta_index->load_lists[z] = 0; + for (i = 0; i < list_count[z]; i++) { + u16 delta_list_size; + u32 list_number; + unsigned int zone_number; + u8 size_data[sizeof(u16)]; + + result = uds_read_from_buffered_reader(buffered_readers[z], + size_data, + sizeof(size_data)); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read delta index size"); + } + + delta_list_size = get_unaligned_le16(size_data); + if (delta_list_size > 0) + delta_index->load_lists[z] += 1; + + list_number = first_list[z] + i; + zone_number = list_number / delta_index->lists_per_zone; + delta_zone = &delta_index->delta_zones[zone_number]; + list_number -= delta_zone->first_list; + delta_zone->delta_lists[list_number + 1].size = delta_list_size; + } + } + + /* Prepare each zone to start receiving the delta list data. */ + for (z = 0; z < delta_index->zone_count; z++) + rebalance_lists(&delta_index->delta_zones[z]); + + return UDS_SUCCESS; +} + +static int restore_delta_list_to_zone(struct delta_zone *delta_zone, + const struct delta_list_save_info *save_info, + const u8 *data) +{ + struct delta_list *delta_list; + u16 bit_count; + u16 byte_count; + u32 list_number = save_info->index - delta_zone->first_list; + + if (list_number >= delta_zone->list_count) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "invalid delta list number %u not in range [%u,%u)", + save_info->index, delta_zone->first_list, + delta_zone->first_list + delta_zone->list_count); + } + + delta_list = &delta_zone->delta_lists[list_number + 1]; + if (delta_list->size == 0) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "unexpected delta list number %u", + save_info->index); + } + + bit_count = delta_list->size + save_info->bit_offset; + byte_count = BITS_TO_BYTES(bit_count); + if (save_info->byte_count != byte_count) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "unexpected delta list size %u != %u", + save_info->byte_count, byte_count); + } + + move_bits(data, save_info->bit_offset, delta_zone->memory, delta_list->start, + delta_list->size); + return UDS_SUCCESS; +} + +static int restore_delta_list_data(struct delta_index *delta_index, unsigned int load_zone, + struct buffered_reader *buffered_reader, u8 *data) +{ + int result; + struct delta_list_save_info save_info; + u8 buffer[sizeof(struct delta_list_save_info)]; + unsigned int new_zone; + + result = uds_read_from_buffered_reader(buffered_reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read delta list data"); + } + + save_info = (struct delta_list_save_info) { + .tag = buffer[0], + .bit_offset = buffer[1], + .byte_count = get_unaligned_le16(&buffer[2]), + .index = get_unaligned_le32(&buffer[4]), + }; + + if ((save_info.bit_offset >= BITS_PER_BYTE) || + (save_info.byte_count > DELTA_LIST_MAX_BYTE_COUNT)) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "corrupt delta list data"); + } + + /* Make sure the data is intended for this delta index. */ + if (save_info.tag != delta_index->tag) + return UDS_CORRUPT_DATA; + + if (save_info.index >= delta_index->list_count) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "invalid delta list number %u of %u", + save_info.index, + delta_index->list_count); + } + + result = uds_read_from_buffered_reader(buffered_reader, data, + save_info.byte_count); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read delta list data"); + } + + delta_index->load_lists[load_zone] -= 1; + new_zone = save_info.index / delta_index->lists_per_zone; + return restore_delta_list_to_zone(&delta_index->delta_zones[new_zone], + &save_info, data); +} + +/* Restore delta lists from saved data. */ +int uds_finish_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + int saved_result = UDS_SUCCESS; + unsigned int z; + u8 *data; + + result = vdo_allocate(DELTA_LIST_MAX_BYTE_COUNT, u8, __func__, &data); + if (result != VDO_SUCCESS) + return result; + + for (z = 0; z < reader_count; z++) { + while (delta_index->load_lists[z] > 0) { + result = restore_delta_list_data(delta_index, z, + buffered_readers[z], data); + if (result != UDS_SUCCESS) { + saved_result = result; + break; + } + } + } + + vdo_free(data); + return saved_result; +} + +int uds_check_guard_delta_lists(struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + unsigned int z; + u8 buffer[sizeof(struct delta_list_save_info)]; + + for (z = 0; z < reader_count; z++) { + result = uds_read_from_buffered_reader(buffered_readers[z], buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) + return result; + + if (buffer[0] != 'z') + return UDS_CORRUPT_DATA; + } + + return UDS_SUCCESS; +} + +static int flush_delta_list(struct delta_zone *zone, u32 flush_index) +{ + struct delta_list *delta_list; + u8 buffer[sizeof(struct delta_list_save_info)]; + int result; + + delta_list = &zone->delta_lists[flush_index + 1]; + + buffer[0] = zone->tag; + buffer[1] = delta_list->start % BITS_PER_BYTE; + put_unaligned_le16(get_delta_list_byte_size(delta_list), &buffer[2]); + put_unaligned_le32(zone->first_list + flush_index, &buffer[4]); + + result = uds_write_to_buffered_writer(zone->buffered_writer, buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) { + vdo_log_warning_strerror(result, "failed to write delta list memory"); + return result; + } + + result = uds_write_to_buffered_writer(zone->buffered_writer, + zone->memory + get_delta_list_byte_start(delta_list), + get_delta_list_byte_size(delta_list)); + if (result != UDS_SUCCESS) + vdo_log_warning_strerror(result, "failed to write delta list memory"); + + return result; +} + +/* Start saving a delta index zone to a buffered output stream. */ +int uds_start_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number, + struct buffered_writer *buffered_writer) +{ + int result; + u32 i; + struct delta_zone *delta_zone; + u8 buffer[sizeof(struct delta_index_header)]; + size_t offset = 0; + + delta_zone = &delta_index->delta_zones[zone_number]; + memcpy(buffer, DELTA_INDEX_MAGIC, MAGIC_SIZE); + offset += MAGIC_SIZE; + encode_u32_le(buffer, &offset, zone_number); + encode_u32_le(buffer, &offset, delta_index->zone_count); + encode_u32_le(buffer, &offset, delta_zone->first_list); + encode_u32_le(buffer, &offset, delta_zone->list_count); + encode_u64_le(buffer, &offset, delta_zone->record_count); + encode_u64_le(buffer, &offset, delta_zone->collision_count); + + result = VDO_ASSERT(offset == sizeof(struct delta_index_header), + "%zu bytes encoded of %zu expected", offset, + sizeof(struct delta_index_header)); + if (result != VDO_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(buffered_writer, buffer, offset); + if (result != UDS_SUCCESS) + return vdo_log_warning_strerror(result, + "failed to write delta index header"); + + for (i = 0; i < delta_zone->list_count; i++) { + u8 data[sizeof(u16)]; + struct delta_list *delta_list; + + delta_list = &delta_zone->delta_lists[i + 1]; + put_unaligned_le16(delta_list->size, data); + result = uds_write_to_buffered_writer(buffered_writer, data, + sizeof(data)); + if (result != UDS_SUCCESS) + return vdo_log_warning_strerror(result, + "failed to write delta list size"); + } + + delta_zone->buffered_writer = buffered_writer; + return UDS_SUCCESS; +} + +int uds_finish_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number) +{ + int result; + int first_error = UDS_SUCCESS; + u32 i; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + delta_zone = &delta_index->delta_zones[zone_number]; + for (i = 0; i < delta_zone->list_count; i++) { + delta_list = &delta_zone->delta_lists[i + 1]; + if (delta_list->size > 0) { + result = flush_delta_list(delta_zone, i); + if ((result != UDS_SUCCESS) && (first_error == UDS_SUCCESS)) + first_error = result; + } + } + + delta_zone->buffered_writer = NULL; + return first_error; +} + +int uds_write_guard_delta_list(struct buffered_writer *buffered_writer) +{ + int result; + u8 buffer[sizeof(struct delta_list_save_info)]; + + memset(buffer, 0, sizeof(struct delta_list_save_info)); + buffer[0] = 'z'; + + result = uds_write_to_buffered_writer(buffered_writer, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + vdo_log_warning_strerror(result, "failed to write guard delta list"); + + return UDS_SUCCESS; +} + +size_t uds_compute_delta_index_save_bytes(u32 list_count, size_t memory_size) +{ + /* One zone will use at least as much memory as other zone counts. */ + return (sizeof(struct delta_index_header) + + list_count * (sizeof(struct delta_list_save_info) + 1) + + get_zone_memory_size(1, memory_size)); +} + +static int assert_not_at_end(const struct delta_index_entry *delta_entry) +{ + int result = VDO_ASSERT(!delta_entry->at_end, + "operation is invalid because the list entry is at the end of the delta list"); + if (result != VDO_SUCCESS) + result = UDS_BAD_STATE; + + return result; +} + +/* + * Prepare to search for an entry in the specified delta list. + * + * This is always the first function to be called when dealing with delta index entries. It is + * always followed by calls to uds_next_delta_index_entry() to iterate through a delta list. The + * fields of the delta_index_entry argument will be set up for iteration, but will not contain an + * entry from the list. + */ +int uds_start_delta_index_search(const struct delta_index *delta_index, u32 list_number, + u32 key, struct delta_index_entry *delta_entry) +{ + int result; + unsigned int zone_number; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + result = VDO_ASSERT((list_number < delta_index->list_count), + "Delta list number (%u) is out of range (%u)", list_number, + delta_index->list_count); + if (result != VDO_SUCCESS) + return UDS_CORRUPT_DATA; + + zone_number = list_number / delta_index->lists_per_zone; + delta_zone = &delta_index->delta_zones[zone_number]; + list_number -= delta_zone->first_list; + result = VDO_ASSERT((list_number < delta_zone->list_count), + "Delta list number (%u) is out of range (%u) for zone (%u)", + list_number, delta_zone->list_count, zone_number); + if (result != VDO_SUCCESS) + return UDS_CORRUPT_DATA; + + if (delta_index->mutable) { + delta_list = &delta_zone->delta_lists[list_number + 1]; + } else { + u32 end_offset; + + /* + * Translate the immutable delta list header into a temporary + * full delta list header. + */ + delta_list = &delta_entry->temp_delta_list; + delta_list->start = get_immutable_start(delta_zone->memory, list_number); + end_offset = get_immutable_start(delta_zone->memory, list_number + 1); + delta_list->size = end_offset - delta_list->start; + delta_list->save_key = 0; + delta_list->save_offset = 0; + } + + if (key > delta_list->save_key) { + delta_entry->key = delta_list->save_key; + delta_entry->offset = delta_list->save_offset; + } else { + delta_entry->key = 0; + delta_entry->offset = 0; + if (key == 0) { + /* + * This usually means we're about to walk the entire delta list, so get all + * of it into the CPU cache. + */ + uds_prefetch_range(&delta_zone->memory[delta_list->start / BITS_PER_BYTE], + delta_list->size / BITS_PER_BYTE, false); + } + } + + delta_entry->at_end = false; + delta_entry->delta_zone = delta_zone; + delta_entry->delta_list = delta_list; + delta_entry->entry_bits = 0; + delta_entry->is_collision = false; + delta_entry->list_number = list_number; + delta_entry->list_overflow = false; + delta_entry->value_bits = delta_zone->value_bits; + return UDS_SUCCESS; +} + +static inline u64 get_delta_entry_offset(const struct delta_index_entry *delta_entry) +{ + return delta_entry->delta_list->start + delta_entry->offset; +} + +/* + * Decode a delta index entry delta value. The delta_index_entry basically describes the previous + * list entry, and has had its offset field changed to point to the subsequent entry. We decode the + * bit stream and update the delta_list_entry to describe the entry. + */ +static inline void decode_delta(struct delta_index_entry *delta_entry) +{ + int key_bits; + u32 delta; + const struct delta_zone *delta_zone = delta_entry->delta_zone; + const u8 *memory = delta_zone->memory; + u64 delta_offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits; + const u8 *addr = memory + delta_offset / BITS_PER_BYTE; + int offset = delta_offset % BITS_PER_BYTE; + u32 data = get_unaligned_le32(addr) >> offset; + + addr += sizeof(u32); + key_bits = delta_zone->min_bits; + delta = data & ((1 << key_bits) - 1); + if (delta >= delta_zone->min_keys) { + data >>= key_bits; + if (data == 0) { + key_bits = sizeof(u32) * BITS_PER_BYTE - offset; + while ((data = get_unaligned_le32(addr)) == 0) { + addr += sizeof(u32); + key_bits += sizeof(u32) * BITS_PER_BYTE; + } + } + key_bits += ffs(data); + delta += ((key_bits - delta_zone->min_bits - 1) * delta_zone->incr_keys); + } + delta_entry->delta = delta; + delta_entry->key += delta; + + /* Check for a collision, a delta of zero after the start. */ + if (unlikely((delta == 0) && (delta_entry->offset > 0))) { + delta_entry->is_collision = true; + delta_entry->entry_bits = delta_entry->value_bits + key_bits + COLLISION_BITS; + } else { + delta_entry->is_collision = false; + delta_entry->entry_bits = delta_entry->value_bits + key_bits; + } +} + +noinline int uds_next_delta_index_entry(struct delta_index_entry *delta_entry) +{ + int result; + const struct delta_list *delta_list; + u32 next_offset; + u16 size; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + delta_list = delta_entry->delta_list; + delta_entry->offset += delta_entry->entry_bits; + size = delta_list->size; + if (unlikely(delta_entry->offset >= size)) { + delta_entry->at_end = true; + delta_entry->delta = 0; + delta_entry->is_collision = false; + result = VDO_ASSERT((delta_entry->offset == size), + "next offset past end of delta list"); + if (result != VDO_SUCCESS) + result = UDS_CORRUPT_DATA; + + return result; + } + + decode_delta(delta_entry); + + next_offset = delta_entry->offset + delta_entry->entry_bits; + if (next_offset > size) { + /* + * This is not an assertion because uds_validate_chapter_index_page() wants to + * handle this error. + */ + vdo_log_warning("Decoded past the end of the delta list"); + return UDS_CORRUPT_DATA; + } + + return UDS_SUCCESS; +} + +int uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry) +{ + int result; + struct delta_list *delta_list = delta_entry->delta_list; + + result = VDO_ASSERT(!delta_entry->is_collision, "entry is not a collision"); + if (result != VDO_SUCCESS) + return result; + + delta_list->save_key = delta_entry->key - delta_entry->delta; + delta_list->save_offset = delta_entry->offset; + return UDS_SUCCESS; +} + +static void set_delta(struct delta_index_entry *delta_entry, u32 delta) +{ + const struct delta_zone *delta_zone = delta_entry->delta_zone; + u32 key_bits = (delta_zone->min_bits + + ((delta_zone->incr_keys - delta_zone->min_keys + delta) / + delta_zone->incr_keys)); + + delta_entry->delta = delta; + delta_entry->entry_bits = delta_entry->value_bits + key_bits; +} + +static void get_collision_name(const struct delta_index_entry *entry, u8 *name) +{ + u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS; + const u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE; + int size = COLLISION_BYTES; + int shift = offset % BITS_PER_BYTE; + + while (--size >= 0) + *name++ = get_unaligned_le16(addr++) >> shift; +} + +static void set_collision_name(const struct delta_index_entry *entry, const u8 *name) +{ + u64 offset = get_delta_entry_offset(entry) + entry->entry_bits - COLLISION_BITS; + u8 *addr = entry->delta_zone->memory + offset / BITS_PER_BYTE; + int size = COLLISION_BYTES; + int shift = offset % BITS_PER_BYTE; + u16 mask = ~((u16) 0xFF << shift); + u16 data; + + while (--size >= 0) { + data = (get_unaligned_le16(addr) & mask) | (*name++ << shift); + put_unaligned_le16(data, addr++); + } +} + +int uds_get_delta_index_entry(const struct delta_index *delta_index, u32 list_number, + u32 key, const u8 *name, + struct delta_index_entry *delta_entry) +{ + int result; + + result = uds_start_delta_index_search(delta_index, list_number, key, + delta_entry); + if (result != UDS_SUCCESS) + return result; + + do { + result = uds_next_delta_index_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + } while (!delta_entry->at_end && (key > delta_entry->key)); + + result = uds_remember_delta_index_offset(delta_entry); + if (result != UDS_SUCCESS) + return result; + + if (!delta_entry->at_end && (key == delta_entry->key)) { + struct delta_index_entry collision_entry = *delta_entry; + + for (;;) { + u8 full_name[COLLISION_BYTES]; + + result = uds_next_delta_index_entry(&collision_entry); + if (result != UDS_SUCCESS) + return result; + + if (collision_entry.at_end || !collision_entry.is_collision) + break; + + get_collision_name(&collision_entry, full_name); + if (memcmp(full_name, name, COLLISION_BYTES) == 0) { + *delta_entry = collision_entry; + break; + } + } + } + + return UDS_SUCCESS; +} + +int uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, u8 *name) +{ + int result; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = VDO_ASSERT(delta_entry->is_collision, + "Cannot get full block name from a non-collision delta index entry"); + if (result != VDO_SUCCESS) + return UDS_BAD_STATE; + + get_collision_name(delta_entry, name); + return UDS_SUCCESS; +} + +u32 uds_get_delta_entry_value(const struct delta_index_entry *delta_entry) +{ + return get_field(delta_entry->delta_zone->memory, + get_delta_entry_offset(delta_entry), delta_entry->value_bits); +} + +static int assert_mutable_entry(const struct delta_index_entry *delta_entry) +{ + int result = VDO_ASSERT((delta_entry->delta_list != &delta_entry->temp_delta_list), + "delta index is mutable"); + if (result != VDO_SUCCESS) + result = UDS_BAD_STATE; + + return result; +} + +int uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value) +{ + int result; + u32 value_mask = (1 << delta_entry->value_bits) - 1; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = VDO_ASSERT((value & value_mask) == value, + "Value (%u) being set in a delta index is too large (must fit in %u bits)", + value, delta_entry->value_bits); + if (result != VDO_SUCCESS) + return UDS_INVALID_ARGUMENT; + + set_field(value, delta_entry->delta_zone->memory, + get_delta_entry_offset(delta_entry), delta_entry->value_bits); + return UDS_SUCCESS; +} + +/* + * Extend the memory used by the delta lists by adding growing_size bytes before the list indicated + * by growing_index, then rebalancing the lists in the new chunk. + */ +static int extend_delta_zone(struct delta_zone *delta_zone, u32 growing_index, + size_t growing_size) +{ + ktime_t start_time; + ktime_t end_time; + struct delta_list *delta_lists; + u32 i; + size_t used_space; + + + /* Calculate the amount of space that is or will be in use. */ + start_time = current_time_ns(CLOCK_MONOTONIC); + delta_lists = delta_zone->delta_lists; + used_space = growing_size; + for (i = 0; i <= delta_zone->list_count + 1; i++) + used_space += get_delta_list_byte_size(&delta_lists[i]); + + if (delta_zone->size < used_space) + return UDS_OVERFLOW; + + /* Compute the new offsets of the delta lists. */ + compute_new_list_offsets(delta_zone, growing_index, growing_size, used_space); + + /* + * When we rebalance the delta list, we will include the end guard list in the rebalancing. + * It contains the end guard data, which must be copied. + */ + rebalance_delta_zone(delta_zone, 1, delta_zone->list_count + 1); + end_time = current_time_ns(CLOCK_MONOTONIC); + delta_zone->rebalance_count++; + delta_zone->rebalance_time += ktime_sub(end_time, start_time); + return UDS_SUCCESS; +} + +static int insert_bits(struct delta_index_entry *delta_entry, u16 size) +{ + u64 free_before; + u64 free_after; + u64 source; + u64 destination; + u32 count; + bool before_flag; + u8 *memory; + struct delta_zone *delta_zone = delta_entry->delta_zone; + struct delta_list *delta_list = delta_entry->delta_list; + /* Compute bits in use before and after the inserted bits. */ + u32 total_size = delta_list->size; + u32 before_size = delta_entry->offset; + u32 after_size = total_size - delta_entry->offset; + + if (total_size + size > U16_MAX) { + delta_entry->list_overflow = true; + delta_zone->overflow_count++; + return UDS_OVERFLOW; + } + + /* Compute bits available before and after the delta list. */ + free_before = (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size)); + free_after = (delta_list[1].start - (delta_list[0].start + delta_list[0].size)); + + if ((size <= free_before) && (size <= free_after)) { + /* + * We have enough space to use either before or after the list. Select the smaller + * amount of data. If it is exactly the same, try to take from the larger amount of + * free space. + */ + if (before_size < after_size) + before_flag = true; + else if (after_size < before_size) + before_flag = false; + else + before_flag = free_before > free_after; + } else if (size <= free_before) { + /* There is space before but not after. */ + before_flag = true; + } else if (size <= free_after) { + /* There is space after but not before. */ + before_flag = false; + } else { + /* + * Neither of the surrounding spaces is large enough for this request. Extend + * and/or rebalance the delta list memory choosing to move the least amount of + * data. + */ + int result; + u32 growing_index = delta_entry->list_number + 1; + + before_flag = before_size < after_size; + if (!before_flag) + growing_index++; + result = extend_delta_zone(delta_zone, growing_index, + BITS_TO_BYTES(size)); + if (result != UDS_SUCCESS) + return result; + } + + delta_list->size += size; + if (before_flag) { + source = delta_list->start; + destination = source - size; + delta_list->start -= size; + count = before_size; + } else { + source = delta_list->start + delta_entry->offset; + destination = source + size; + count = after_size; + } + + memory = delta_zone->memory; + move_bits(memory, source, memory, destination, count); + return UDS_SUCCESS; +} + +static void encode_delta(const struct delta_index_entry *delta_entry) +{ + u32 temp; + u32 t1; + u32 t2; + u64 offset; + const struct delta_zone *delta_zone = delta_entry->delta_zone; + u8 *memory = delta_zone->memory; + + offset = get_delta_entry_offset(delta_entry) + delta_entry->value_bits; + if (delta_entry->delta < delta_zone->min_keys) { + set_field(delta_entry->delta, memory, offset, delta_zone->min_bits); + return; + } + + temp = delta_entry->delta - delta_zone->min_keys; + t1 = (temp % delta_zone->incr_keys) + delta_zone->min_keys; + t2 = temp / delta_zone->incr_keys; + set_field(t1, memory, offset, delta_zone->min_bits); + set_zero(memory, offset + delta_zone->min_bits, t2); + set_field(1, memory, offset + delta_zone->min_bits + t2, 1); +} + +static void encode_entry(const struct delta_index_entry *delta_entry, u32 value, + const u8 *name) +{ + u8 *memory = delta_entry->delta_zone->memory; + u64 offset = get_delta_entry_offset(delta_entry); + + set_field(value, memory, offset, delta_entry->value_bits); + encode_delta(delta_entry); + if (name != NULL) + set_collision_name(delta_entry, name); +} + +/* + * Create a new entry in the delta index. If the entry is a collision, the full 256 bit name must + * be provided. + */ +int uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, u32 value, + const u8 *name) +{ + int result; + struct delta_zone *delta_zone; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + + if (delta_entry->is_collision) { + /* + * The caller wants us to insert a collision entry onto a collision entry. This + * happens when we find a collision and attempt to add the name again to the index. + * This is normally a fatal error unless we are replaying a closed chapter while we + * are rebuilding a volume index. + */ + return UDS_DUPLICATE_NAME; + } + + if (delta_entry->offset < delta_entry->delta_list->save_offset) { + /* + * The saved entry offset is after the new entry and will no longer be valid, so + * replace it with the insertion point. + */ + result = uds_remember_delta_index_offset(delta_entry); + if (result != UDS_SUCCESS) + return result; + } + + if (name != NULL) { + /* Insert a collision entry which is placed after this entry. */ + result = assert_not_at_end(delta_entry); + if (result != UDS_SUCCESS) + return result; + + result = VDO_ASSERT((key == delta_entry->key), + "incorrect key for collision entry"); + if (result != VDO_SUCCESS) + return result; + + delta_entry->offset += delta_entry->entry_bits; + set_delta(delta_entry, 0); + delta_entry->is_collision = true; + delta_entry->entry_bits += COLLISION_BITS; + result = insert_bits(delta_entry, delta_entry->entry_bits); + } else if (delta_entry->at_end) { + /* Insert a new entry at the end of the delta list. */ + result = VDO_ASSERT((key >= delta_entry->key), "key past end of list"); + if (result != VDO_SUCCESS) + return result; + + set_delta(delta_entry, key - delta_entry->key); + delta_entry->key = key; + delta_entry->at_end = false; + result = insert_bits(delta_entry, delta_entry->entry_bits); + } else { + u16 old_entry_size; + u16 additional_size; + struct delta_index_entry next_entry; + u32 next_value; + + /* + * Insert a new entry which requires the delta in the following entry to be + * updated. + */ + result = VDO_ASSERT((key < delta_entry->key), + "key precedes following entry"); + if (result != VDO_SUCCESS) + return result; + + result = VDO_ASSERT((key >= delta_entry->key - delta_entry->delta), + "key effects following entry's delta"); + if (result != VDO_SUCCESS) + return result; + + old_entry_size = delta_entry->entry_bits; + next_entry = *delta_entry; + next_value = uds_get_delta_entry_value(&next_entry); + set_delta(delta_entry, key - (delta_entry->key - delta_entry->delta)); + delta_entry->key = key; + set_delta(&next_entry, next_entry.key - key); + next_entry.offset += delta_entry->entry_bits; + /* The two new entries are always bigger than the single entry being replaced. */ + additional_size = (delta_entry->entry_bits + + next_entry.entry_bits - old_entry_size); + result = insert_bits(delta_entry, additional_size); + if (result != UDS_SUCCESS) + return result; + + encode_entry(&next_entry, next_value, NULL); + } + + if (result != UDS_SUCCESS) + return result; + + encode_entry(delta_entry, value, name); + delta_zone = delta_entry->delta_zone; + delta_zone->record_count++; + delta_zone->collision_count += delta_entry->is_collision ? 1 : 0; + return UDS_SUCCESS; +} + +static void delete_bits(const struct delta_index_entry *delta_entry, int size) +{ + u64 source; + u64 destination; + u32 count; + bool before_flag; + struct delta_list *delta_list = delta_entry->delta_list; + u8 *memory = delta_entry->delta_zone->memory; + /* Compute bits retained before and after the deleted bits. */ + u32 total_size = delta_list->size; + u32 before_size = delta_entry->offset; + u32 after_size = total_size - delta_entry->offset - size; + + /* + * Determine whether to add to the available space either before or after the delta list. + * We prefer to move the least amount of data. If it is exactly the same, try to add to the + * smaller amount of free space. + */ + if (before_size < after_size) { + before_flag = true; + } else if (after_size < before_size) { + before_flag = false; + } else { + u64 free_before = + (delta_list[0].start - (delta_list[-1].start + delta_list[-1].size)); + u64 free_after = + (delta_list[1].start - (delta_list[0].start + delta_list[0].size)); + + before_flag = (free_before < free_after); + } + + delta_list->size -= size; + if (before_flag) { + source = delta_list->start; + destination = source + size; + delta_list->start += size; + count = before_size; + } else { + destination = delta_list->start + delta_entry->offset; + source = destination + size; + count = after_size; + } + + move_bits(memory, source, memory, destination, count); +} + +int uds_remove_delta_index_entry(struct delta_index_entry *delta_entry) +{ + int result; + struct delta_index_entry next_entry; + struct delta_zone *delta_zone; + struct delta_list *delta_list; + + result = assert_mutable_entry(delta_entry); + if (result != UDS_SUCCESS) + return result; + + next_entry = *delta_entry; + result = uds_next_delta_index_entry(&next_entry); + if (result != UDS_SUCCESS) + return result; + + delta_zone = delta_entry->delta_zone; + + if (delta_entry->is_collision) { + /* This is a collision entry, so just remove it. */ + delete_bits(delta_entry, delta_entry->entry_bits); + next_entry.offset = delta_entry->offset; + delta_zone->collision_count -= 1; + } else if (next_entry.at_end) { + /* This entry is at the end of the list, so just remove it. */ + delete_bits(delta_entry, delta_entry->entry_bits); + next_entry.key -= delta_entry->delta; + next_entry.offset = delta_entry->offset; + } else { + /* The delta in the next entry needs to be updated. */ + u32 next_value = uds_get_delta_entry_value(&next_entry); + u16 old_size = delta_entry->entry_bits + next_entry.entry_bits; + + if (next_entry.is_collision) { + next_entry.is_collision = false; + delta_zone->collision_count -= 1; + } + + set_delta(&next_entry, delta_entry->delta + next_entry.delta); + next_entry.offset = delta_entry->offset; + /* The one new entry is always smaller than the two entries being replaced. */ + delete_bits(delta_entry, old_size - next_entry.entry_bits); + encode_entry(&next_entry, next_value, NULL); + } + + delta_zone->record_count--; + delta_zone->discard_count++; + *delta_entry = next_entry; + + delta_list = delta_entry->delta_list; + if (delta_entry->offset < delta_list->save_offset) { + /* The saved entry offset is no longer valid. */ + delta_list->save_key = 0; + delta_list->save_offset = 0; + } + + return UDS_SUCCESS; +} + +void uds_get_delta_index_stats(const struct delta_index *delta_index, + struct delta_index_stats *stats) +{ + unsigned int z; + const struct delta_zone *delta_zone; + + memset(stats, 0, sizeof(struct delta_index_stats)); + for (z = 0; z < delta_index->zone_count; z++) { + delta_zone = &delta_index->delta_zones[z]; + stats->rebalance_time += delta_zone->rebalance_time; + stats->rebalance_count += delta_zone->rebalance_count; + stats->record_count += delta_zone->record_count; + stats->collision_count += delta_zone->collision_count; + stats->discard_count += delta_zone->discard_count; + stats->overflow_count += delta_zone->overflow_count; + stats->list_count += delta_zone->list_count; + } +} + +size_t uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, u32 payload_bits) +{ + u16 min_bits; + u32 incr_keys; + u32 min_keys; + + compute_coding_constants(mean_delta, &min_bits, &min_keys, &incr_keys); + /* On average, each delta is encoded into about min_bits + 1.5 bits. */ + return entry_count * (payload_bits + min_bits + 1) + entry_count / 2; +} + +u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta, + u32 payload_bits, size_t bytes_per_page) +{ + unsigned int bits_per_delta_list; + unsigned int bits_per_page; + size_t bits_per_index; + + /* Compute the expected number of bits needed for all the entries. */ + bits_per_index = uds_compute_delta_index_size(entry_count, mean_delta, + payload_bits); + bits_per_delta_list = bits_per_index / list_count; + + /* Add in the immutable delta list headers. */ + bits_per_index += list_count * IMMUTABLE_HEADER_SIZE; + /* Compute the number of usable bits on an immutable index page. */ + bits_per_page = ((bytes_per_page - sizeof(struct delta_page_header)) * BITS_PER_BYTE); + /* + * Reduce the bits per page by one immutable delta list header and one delta list to + * account for internal fragmentation. + */ + bits_per_page -= IMMUTABLE_HEADER_SIZE + bits_per_delta_list; + /* Now compute the number of pages needed. */ + return DIV_ROUND_UP(bits_per_index, bits_per_page); +} + +void uds_log_delta_index_entry(struct delta_index_entry *delta_entry) +{ + vdo_log_ratelimit(vdo_log_info, + "List 0x%X Key 0x%X Offset 0x%X%s%s List_size 0x%X%s", + delta_entry->list_number, delta_entry->key, + delta_entry->offset, delta_entry->at_end ? " end" : "", + delta_entry->is_collision ? " collision" : "", + delta_entry->delta_list->size, + delta_entry->list_overflow ? " overflow" : ""); + delta_entry->list_overflow = false; +} diff --git a/drivers/md/dm-vdo/indexer/delta-index.h b/drivers/md/dm-vdo/indexer/delta-index.h new file mode 100644 index 0000000000..53f6c6ac0b --- /dev/null +++ b/drivers/md/dm-vdo/indexer/delta-index.h @@ -0,0 +1,279 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_DELTA_INDEX_H +#define UDS_DELTA_INDEX_H + +#include + +#include "numeric.h" +#include "time-utils.h" + +#include "config.h" +#include "io-factory.h" + +/* + * A delta index is a key-value store, where each entry maps an address (the key) to a payload (the + * value). The entries are sorted by address, and only the delta between successive addresses is + * stored in the entry. The addresses are assumed to be uniformly distributed, and the deltas are + * therefore exponentially distributed. + * + * A delta_index can either be mutable or immutable depending on its expected use. The immutable + * form of a delta index is used for the indexes of closed chapters committed to the volume. The + * mutable form of a delta index is used by the volume index, and also by the chapter index in an + * open chapter. Like the index as a whole, each mutable delta index is divided into a number of + * independent zones. + */ + +struct delta_list { + /* The offset of the delta list start, in bits */ + u64 start; + /* The number of bits in the delta list */ + u16 size; + /* Where the last search "found" the key, in bits */ + u16 save_offset; + /* The key for the record just before save_offset */ + u32 save_key; +}; + +struct delta_zone { + /* The delta list memory */ + u8 *memory; + /* The delta list headers */ + struct delta_list *delta_lists; + /* Temporary starts of delta lists */ + u64 *new_offsets; + /* Buffered writer for saving an index */ + struct buffered_writer *buffered_writer; + /* The size of delta list memory */ + size_t size; + /* Nanoseconds spent rebalancing */ + ktime_t rebalance_time; + /* Number of memory rebalances */ + u32 rebalance_count; + /* The number of bits in a stored value */ + u8 value_bits; + /* The number of bits in the minimal key code */ + u16 min_bits; + /* The number of keys used in a minimal code */ + u32 min_keys; + /* The number of keys used for another code bit */ + u32 incr_keys; + /* The number of records in the index */ + u64 record_count; + /* The number of collision records */ + u64 collision_count; + /* The number of records removed */ + u64 discard_count; + /* The number of UDS_OVERFLOW errors detected */ + u64 overflow_count; + /* The index of the first delta list */ + u32 first_list; + /* The number of delta lists */ + u32 list_count; + /* Tag belonging to this delta index */ + u8 tag; +} __aligned(L1_CACHE_BYTES); + +struct delta_list_save_info { + /* Tag identifying which delta index this list is in */ + u8 tag; + /* Bit offset of the start of the list data */ + u8 bit_offset; + /* Number of bytes of list data */ + u16 byte_count; + /* The delta list number within the delta index */ + u32 index; +} __packed; + +struct delta_index { + /* The zones */ + struct delta_zone *delta_zones; + /* The number of zones */ + unsigned int zone_count; + /* The number of delta lists */ + u32 list_count; + /* Maximum lists per zone */ + u32 lists_per_zone; + /* Total memory allocated to this index */ + size_t memory_size; + /* The number of non-empty lists at load time per zone */ + u32 load_lists[MAX_ZONES]; + /* True if this index is mutable */ + bool mutable; + /* Tag belonging to this delta index */ + u8 tag; +}; + +/* + * A delta_index_page describes a single page of a chapter index. The delta_index field allows the + * page to be treated as an immutable delta_index. We use the delta_zone field to treat the chapter + * index page as a single zone index, and without the need to do an additional memory allocation. + */ +struct delta_index_page { + struct delta_index delta_index; + /* These values are loaded from the delta_page_header */ + u32 lowest_list_number; + u32 highest_list_number; + u64 virtual_chapter_number; + /* This structure describes the single zone of a delta index page. */ + struct delta_zone delta_zone; +}; + +/* + * Notes on the delta_index_entries: + * + * The fields documented as "public" can be read by any code that uses a delta_index. The fields + * documented as "private" carry information between delta_index method calls and should not be + * used outside the delta_index module. + * + * (1) The delta_index_entry is used like an iterator when searching a delta list. + * + * (2) It is also the result of a successful search and can be used to refer to the element found + * by the search. + * + * (3) It is also the result of an unsuccessful search and can be used to refer to the insertion + * point for a new record. + * + * (4) If at_end is true, the delta_list entry can only be used as the insertion point for a new + * record at the end of the list. + * + * (5) If at_end is false and is_collision is true, the delta_list entry fields refer to a + * collision entry in the list, and the delta_list entry can be used as a reference to this + * entry. + * + * (6) If at_end is false and is_collision is false, the delta_list entry fields refer to a + * non-collision entry in the list. Such delta_list entries can be used as a reference to a + * found entry, or an insertion point for a non-collision entry before this entry, or an + * insertion point for a collision entry that collides with this entry. + */ +struct delta_index_entry { + /* Public fields */ + /* The key for this entry */ + u32 key; + /* We are after the last list entry */ + bool at_end; + /* This record is a collision */ + bool is_collision; + + /* Private fields */ + /* This delta list overflowed */ + bool list_overflow; + /* The number of bits used for the value */ + u8 value_bits; + /* The number of bits used for the entire entry */ + u16 entry_bits; + /* The delta index zone */ + struct delta_zone *delta_zone; + /* The delta list containing the entry */ + struct delta_list *delta_list; + /* The delta list number */ + u32 list_number; + /* Bit offset of this entry within the list */ + u16 offset; + /* The delta between this and previous entry */ + u32 delta; + /* Temporary delta list for immutable indices */ + struct delta_list temp_delta_list; +}; + +struct delta_index_stats { + /* Number of bytes allocated */ + size_t memory_allocated; + /* Nanoseconds spent rebalancing */ + ktime_t rebalance_time; + /* Number of memory rebalances */ + u32 rebalance_count; + /* The number of records in the index */ + u64 record_count; + /* The number of collision records */ + u64 collision_count; + /* The number of records removed */ + u64 discard_count; + /* The number of UDS_OVERFLOW errors detected */ + u64 overflow_count; + /* The number of delta lists */ + u32 list_count; +}; + +int __must_check uds_initialize_delta_index(struct delta_index *delta_index, + unsigned int zone_count, u32 list_count, + u32 mean_delta, u32 payload_bits, + size_t memory_size, u8 tag); + +int __must_check uds_initialize_delta_index_page(struct delta_index_page *delta_index_page, + u64 expected_nonce, u32 mean_delta, + u32 payload_bits, u8 *memory, + size_t memory_size); + +void uds_uninitialize_delta_index(struct delta_index *delta_index); + +void uds_reset_delta_index(const struct delta_index *delta_index); + +int __must_check uds_pack_delta_index_page(const struct delta_index *delta_index, + u64 header_nonce, u8 *memory, + size_t memory_size, + u64 virtual_chapter_number, u32 first_list, + u32 *list_count); + +int __must_check uds_start_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count); + +int __must_check uds_finish_restoring_delta_index(struct delta_index *delta_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count); + +int __must_check uds_check_guard_delta_lists(struct buffered_reader **buffered_readers, + unsigned int reader_count); + +int __must_check uds_start_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number, + struct buffered_writer *buffered_writer); + +int __must_check uds_finish_saving_delta_index(const struct delta_index *delta_index, + unsigned int zone_number); + +int __must_check uds_write_guard_delta_list(struct buffered_writer *buffered_writer); + +size_t __must_check uds_compute_delta_index_save_bytes(u32 list_count, + size_t memory_size); + +int __must_check uds_start_delta_index_search(const struct delta_index *delta_index, + u32 list_number, u32 key, + struct delta_index_entry *iterator); + +int __must_check uds_next_delta_index_entry(struct delta_index_entry *delta_entry); + +int __must_check uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry); + +int __must_check uds_get_delta_index_entry(const struct delta_index *delta_index, + u32 list_number, u32 key, const u8 *name, + struct delta_index_entry *delta_entry); + +int __must_check uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry, + u8 *name); + +u32 __must_check uds_get_delta_entry_value(const struct delta_index_entry *delta_entry); + +int __must_check uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value); + +int __must_check uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key, + u32 value, const u8 *name); + +int __must_check uds_remove_delta_index_entry(struct delta_index_entry *delta_entry); + +void uds_get_delta_index_stats(const struct delta_index *delta_index, + struct delta_index_stats *stats); + +size_t __must_check uds_compute_delta_index_size(u32 entry_count, u32 mean_delta, + u32 payload_bits); + +u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta, + u32 payload_bits, size_t bytes_per_page); + +void uds_log_delta_index_entry(struct delta_index_entry *delta_entry); + +#endif /* UDS_DELTA_INDEX_H */ diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.c b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c new file mode 100644 index 0000000000..1a5735375d --- /dev/null +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "funnel-requestqueue.h" + +#include +#include +#include + +#include "funnel-queue.h" +#include "logger.h" +#include "memory-alloc.h" +#include "thread-utils.h" + +/* + * This queue will attempt to handle requests in reasonably sized batches instead of reacting + * immediately to each new request. The wait time between batches is dynamically adjusted up or + * down to try to balance responsiveness against wasted thread run time. + * + * If the wait time becomes long enough, the queue will become dormant and must be explicitly + * awoken when a new request is enqueued. The enqueue operation updates "newest" in the funnel + * queue via xchg (which is a memory barrier), and later checks "dormant" to decide whether to do a + * wakeup of the worker thread. + * + * When deciding to go to sleep, the worker thread sets "dormant" and then examines "newest" to + * decide if the funnel queue is idle. In dormant mode, the last examination of "newest" before + * going to sleep is done inside the wait_event_interruptible() macro, after a point where one or + * more memory barriers have been issued. (Preparing to sleep uses spin locks.) Even if the funnel + * queue's "next" field update isn't visible yet to make the entry accessible, its existence will + * kick the worker thread out of dormant mode and back into timer-based mode. + * + * Unbatched requests are used to communicate between different zone threads and will also cause + * the queue to awaken immediately. + */ + +enum { + NANOSECOND = 1, + MICROSECOND = 1000 * NANOSECOND, + MILLISECOND = 1000 * MICROSECOND, + DEFAULT_WAIT_TIME = 20 * MICROSECOND, + MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2, + MAXIMUM_WAIT_TIME = MILLISECOND, + MINIMUM_BATCH = 32, + MAXIMUM_BATCH = 64, +}; + +struct uds_request_queue { + /* Wait queue for synchronizing producers and consumer */ + struct wait_queue_head wait_head; + /* Function to process a request */ + uds_request_queue_processor_fn processor; + /* Queue of new incoming requests */ + struct funnel_queue *main_queue; + /* Queue of old requests to retry */ + struct funnel_queue *retry_queue; + /* The thread id of the worker thread */ + struct thread *thread; + /* True if the worker was started */ + bool started; + /* When true, requests can be enqueued */ + bool running; + /* A flag set when the worker is waiting without a timeout */ + atomic_t dormant; +}; + +static inline struct uds_request *poll_queues(struct uds_request_queue *queue) +{ + struct funnel_queue_entry *entry; + + entry = vdo_funnel_queue_poll(queue->retry_queue); + if (entry != NULL) + return container_of(entry, struct uds_request, queue_link); + + entry = vdo_funnel_queue_poll(queue->main_queue); + if (entry != NULL) + return container_of(entry, struct uds_request, queue_link); + + return NULL; +} + +static inline bool are_queues_idle(struct uds_request_queue *queue) +{ + return vdo_is_funnel_queue_idle(queue->retry_queue) && + vdo_is_funnel_queue_idle(queue->main_queue); +} + +/* + * Determine if there is a next request to process, and return it if there is. Also return flags + * indicating whether the worker thread can sleep (for the use of wait_event() macros) and whether + * the thread did sleep before returning a new request. + */ +static inline bool dequeue_request(struct uds_request_queue *queue, + struct uds_request **request_ptr, bool *waited_ptr) +{ + struct uds_request *request = poll_queues(queue); + + if (request != NULL) { + *request_ptr = request; + return true; + } + + if (!READ_ONCE(queue->running)) { + /* Wake the worker thread so it can exit. */ + *request_ptr = NULL; + return true; + } + + *request_ptr = NULL; + *waited_ptr = true; + return false; +} + +static void wait_for_request(struct uds_request_queue *queue, bool dormant, + unsigned long timeout, struct uds_request **request, + bool *waited) +{ + if (dormant) { + wait_event_interruptible(queue->wait_head, + (dequeue_request(queue, request, waited) || + !are_queues_idle(queue))); + return; + } + + wait_event_interruptible_hrtimeout(queue->wait_head, + dequeue_request(queue, request, waited), + ns_to_ktime(timeout)); +} + +static void request_queue_worker(void *arg) +{ + struct uds_request_queue *queue = arg; + struct uds_request *request = NULL; + unsigned long time_batch = DEFAULT_WAIT_TIME; + bool dormant = atomic_read(&queue->dormant); + bool waited = false; + long current_batch = 0; + + for (;;) { + wait_for_request(queue, dormant, time_batch, &request, &waited); + if (likely(request != NULL)) { + current_batch++; + queue->processor(request); + } else if (!READ_ONCE(queue->running)) { + break; + } + + if (dormant) { + /* + * The queue has been roused from dormancy. Clear the flag so enqueuers can + * stop broadcasting. No fence is needed for this transition. + */ + atomic_set(&queue->dormant, false); + dormant = false; + time_batch = DEFAULT_WAIT_TIME; + } else if (waited) { + /* + * We waited for this request to show up. Adjust the wait time to smooth + * out the batch size. + */ + if (current_batch < MINIMUM_BATCH) { + /* + * If the last batch of requests was too small, increase the wait + * time. + */ + time_batch += time_batch / 4; + if (time_batch >= MAXIMUM_WAIT_TIME) { + atomic_set(&queue->dormant, true); + dormant = true; + } + } else if (current_batch > MAXIMUM_BATCH) { + /* + * If the last batch of requests was too large, decrease the wait + * time. + */ + time_batch -= time_batch / 4; + if (time_batch < MINIMUM_WAIT_TIME) + time_batch = MINIMUM_WAIT_TIME; + } + current_batch = 0; + } + } + + /* + * Ensure that we process any remaining requests that were enqueued before trying to shut + * down. The corresponding write barrier is in uds_request_queue_finish(). + */ + smp_rmb(); + while ((request = poll_queues(queue)) != NULL) + queue->processor(request); +} + +int uds_make_request_queue(const char *queue_name, + uds_request_queue_processor_fn processor, + struct uds_request_queue **queue_ptr) +{ + int result; + struct uds_request_queue *queue; + + result = vdo_allocate(1, struct uds_request_queue, __func__, &queue); + if (result != VDO_SUCCESS) + return result; + + queue->processor = processor; + queue->running = true; + atomic_set(&queue->dormant, false); + init_waitqueue_head(&queue->wait_head); + + result = vdo_make_funnel_queue(&queue->main_queue); + if (result != VDO_SUCCESS) { + uds_request_queue_finish(queue); + return result; + } + + result = vdo_make_funnel_queue(&queue->retry_queue); + if (result != VDO_SUCCESS) { + uds_request_queue_finish(queue); + return result; + } + + result = vdo_create_thread(request_queue_worker, queue, queue_name, + &queue->thread); + if (result != VDO_SUCCESS) { + uds_request_queue_finish(queue); + return result; + } + + queue->started = true; + *queue_ptr = queue; + return UDS_SUCCESS; +} + +static inline void wake_up_worker(struct uds_request_queue *queue) +{ + if (wq_has_sleeper(&queue->wait_head)) + wake_up(&queue->wait_head); +} + +void uds_request_queue_enqueue(struct uds_request_queue *queue, + struct uds_request *request) +{ + struct funnel_queue *sub_queue; + bool unbatched = request->unbatched; + + sub_queue = request->requeued ? queue->retry_queue : queue->main_queue; + vdo_funnel_queue_put(sub_queue, &request->queue_link); + + /* + * We must wake the worker thread when it is dormant. A read fence isn't needed here since + * we know the queue operation acts as one. + */ + if (atomic_read(&queue->dormant) || unbatched) + wake_up_worker(queue); +} + +void uds_request_queue_finish(struct uds_request_queue *queue) +{ + if (queue == NULL) + return; + + /* + * This memory barrier ensures that any requests we queued will be seen. The point is that + * when dequeue_request() sees the following update to the running flag, it will also be + * able to see any change we made to a next field in the funnel queue entry. The + * corresponding read barrier is in request_queue_worker(). + */ + smp_wmb(); + WRITE_ONCE(queue->running, false); + + if (queue->started) { + wake_up_worker(queue); + vdo_join_threads(queue->thread); + } + + vdo_free_funnel_queue(queue->main_queue); + vdo_free_funnel_queue(queue->retry_queue); + vdo_free(queue); +} diff --git a/drivers/md/dm-vdo/indexer/funnel-requestqueue.h b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h new file mode 100644 index 0000000000..9b0f53939b --- /dev/null +++ b/drivers/md/dm-vdo/indexer/funnel-requestqueue.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_REQUEST_QUEUE_H +#define UDS_REQUEST_QUEUE_H + +#include "indexer.h" + +/* + * A simple request queue which will handle new requests in the order in which they are received, + * and will attempt to handle requeued requests before new ones. However, the nature of the + * implementation means that it cannot guarantee this ordering; the prioritization is merely a + * hint. + */ + +struct uds_request_queue; + +typedef void (*uds_request_queue_processor_fn)(struct uds_request *); + +int __must_check uds_make_request_queue(const char *queue_name, + uds_request_queue_processor_fn processor, + struct uds_request_queue **queue_ptr); + +void uds_request_queue_enqueue(struct uds_request_queue *queue, + struct uds_request *request); + +void uds_request_queue_finish(struct uds_request_queue *queue); + +#endif /* UDS_REQUEST_QUEUE_H */ diff --git a/drivers/md/dm-vdo/indexer/geometry.c b/drivers/md/dm-vdo/indexer/geometry.c new file mode 100644 index 0000000000..c0575612e8 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/geometry.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "geometry.h" + +#include +#include + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "delta-index.h" +#include "indexer.h" + +/* + * An index volume is divided into a fixed number of fixed-size chapters, each consisting of a + * fixed number of fixed-size pages. The volume layout is defined by two constants and four + * parameters. The constants are that index records are 32 bytes long (16-byte block name plus + * 16-byte metadata) and that open chapter index hash slots are one byte long. The four parameters + * are the number of bytes in a page, the number of record pages in a chapter, the number of + * chapters in a volume, and the number of chapters that are sparse. From these parameters, we can + * derive the rest of the layout and other index properties. + * + * The index volume is sized by its maximum memory footprint. For a dense index, the persistent + * storage is about 10 times the size of the memory footprint. For a sparse index, the persistent + * storage is about 100 times the size of the memory footprint. + * + * For a small index with a memory footprint less than 1GB, there are three possible memory + * configurations: 0.25GB, 0.5GB and 0.75GB. The default geometry for each is 1024 index records + * per 32 KB page, 1024 chapters per volume, and either 64, 128, or 192 record pages per chapter + * (resulting in 6, 13, or 20 index pages per chapter) depending on the memory configuration. For + * the VDO default of a 0.25 GB index, this yields a deduplication window of 256 GB using about 2.5 + * GB for the persistent storage and 256 MB of RAM. + * + * For a larger index with a memory footprint that is a multiple of 1 GB, the geometry is 1024 + * index records per 32 KB page, 256 record pages per chapter, 26 index pages per chapter, and 1024 + * chapters for every GB of memory footprint. For a 1 GB volume, this yields a deduplication window + * of 1 TB using about 9GB of persistent storage and 1 GB of RAM. + * + * The above numbers hold for volumes which have no sparse chapters. A sparse volume has 10 times + * as many chapters as the corresponding non-sparse volume, which provides 10 times the + * deduplication window while using 10 times as much persistent storage as the equivalent + * non-sparse volume with the same memory footprint. + * + * If the volume has been converted from a non-lvm format to an lvm volume, the number of chapters + * per volume will have been reduced by one by eliminating physical chapter 0, and the virtual + * chapter that formerly mapped to physical chapter 0 may be remapped to another physical chapter. + * This remapping is expressed by storing which virtual chapter was remapped, and which physical + * chapter it was moved to. + */ + +int uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, + u32 chapters_per_volume, u32 sparse_chapters_per_volume, + u64 remapped_virtual, u64 remapped_physical, + struct index_geometry **geometry_ptr) +{ + int result; + struct index_geometry *geometry; + + result = vdo_allocate(1, struct index_geometry, "geometry", &geometry); + if (result != VDO_SUCCESS) + return result; + + geometry->bytes_per_page = bytes_per_page; + geometry->record_pages_per_chapter = record_pages_per_chapter; + geometry->chapters_per_volume = chapters_per_volume; + geometry->sparse_chapters_per_volume = sparse_chapters_per_volume; + geometry->dense_chapters_per_volume = chapters_per_volume - sparse_chapters_per_volume; + geometry->remapped_virtual = remapped_virtual; + geometry->remapped_physical = remapped_physical; + + geometry->records_per_page = bytes_per_page / BYTES_PER_RECORD; + geometry->records_per_chapter = geometry->records_per_page * record_pages_per_chapter; + geometry->records_per_volume = (u64) geometry->records_per_chapter * chapters_per_volume; + + geometry->chapter_mean_delta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; + geometry->chapter_payload_bits = bits_per(record_pages_per_chapter - 1); + /* + * We want 1 delta list for every 64 records in the chapter. + * The "| 077" ensures that the chapter_delta_list_bits computation + * does not underflow. + */ + geometry->chapter_delta_list_bits = + bits_per((geometry->records_per_chapter - 1) | 077) - 6; + geometry->delta_lists_per_chapter = 1 << geometry->chapter_delta_list_bits; + /* We need enough address bits to achieve the desired mean delta. */ + geometry->chapter_address_bits = + (DEFAULT_CHAPTER_MEAN_DELTA_BITS - + geometry->chapter_delta_list_bits + + bits_per(geometry->records_per_chapter - 1)); + geometry->index_pages_per_chapter = + uds_get_delta_index_page_count(geometry->records_per_chapter, + geometry->delta_lists_per_chapter, + geometry->chapter_mean_delta, + geometry->chapter_payload_bits, + bytes_per_page); + + geometry->pages_per_chapter = geometry->index_pages_per_chapter + record_pages_per_chapter; + geometry->pages_per_volume = geometry->pages_per_chapter * chapters_per_volume; + geometry->bytes_per_volume = + bytes_per_page * (geometry->pages_per_volume + HEADER_PAGES_PER_VOLUME); + + *geometry_ptr = geometry; + return UDS_SUCCESS; +} + +int uds_copy_index_geometry(struct index_geometry *source, + struct index_geometry **geometry_ptr) +{ + return uds_make_index_geometry(source->bytes_per_page, + source->record_pages_per_chapter, + source->chapters_per_volume, + source->sparse_chapters_per_volume, + source->remapped_virtual, source->remapped_physical, + geometry_ptr); +} + +void uds_free_index_geometry(struct index_geometry *geometry) +{ + vdo_free(geometry); +} + +u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, + u64 virtual_chapter) +{ + u64 delta; + + if (!uds_is_reduced_index_geometry(geometry)) + return virtual_chapter % geometry->chapters_per_volume; + + if (likely(virtual_chapter > geometry->remapped_virtual)) { + delta = virtual_chapter - geometry->remapped_virtual; + if (likely(delta > geometry->remapped_physical)) + return delta % geometry->chapters_per_volume; + else + return delta - 1; + } + + if (virtual_chapter == geometry->remapped_virtual) + return geometry->remapped_physical; + + delta = geometry->remapped_virtual - virtual_chapter; + if (delta < geometry->chapters_per_volume) + return geometry->chapters_per_volume - delta; + + /* This chapter is so old the answer doesn't matter. */ + return 0; +} + +/* Check whether any sparse chapters are in use. */ +bool uds_has_sparse_chapters(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, u64 newest_virtual_chapter) +{ + return uds_is_sparse_index_geometry(geometry) && + ((newest_virtual_chapter - oldest_virtual_chapter + 1) > + geometry->dense_chapters_per_volume); +} + +bool uds_is_chapter_sparse(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, u64 newest_virtual_chapter, + u64 virtual_chapter_number) +{ + return uds_has_sparse_chapters(geometry, oldest_virtual_chapter, + newest_virtual_chapter) && + ((virtual_chapter_number + geometry->dense_chapters_per_volume) <= + newest_virtual_chapter); +} + +/* Calculate how many chapters to expire after opening the newest chapter. */ +u32 uds_chapters_to_expire(const struct index_geometry *geometry, u64 newest_chapter) +{ + /* If the index isn't full yet, don't expire anything. */ + if (newest_chapter < geometry->chapters_per_volume) + return 0; + + /* If a chapter is out of order... */ + if (geometry->remapped_physical > 0) { + u64 oldest_chapter = newest_chapter - geometry->chapters_per_volume; + + /* + * ... expire an extra chapter when expiring the moved chapter to free physical + * space for the new chapter ... + */ + if (oldest_chapter == geometry->remapped_virtual) + return 2; + + /* + * ... but don't expire anything when the new chapter will use the physical chapter + * freed by expiring the moved chapter. + */ + if (oldest_chapter == (geometry->remapped_virtual + geometry->remapped_physical)) + return 0; + } + + /* Normally, just expire one. */ + return 1; +} diff --git a/drivers/md/dm-vdo/indexer/geometry.h b/drivers/md/dm-vdo/indexer/geometry.h new file mode 100644 index 0000000000..a2ecdb238c --- /dev/null +++ b/drivers/md/dm-vdo/indexer/geometry.h @@ -0,0 +1,140 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_GEOMETRY_H +#define UDS_INDEX_GEOMETRY_H + +#include "indexer.h" + +/* + * The index_geometry records parameters that define the layout of a UDS index volume, and the size and + * shape of various index structures. It is created when the index is created, and is referenced by + * many index sub-components. + */ + +struct index_geometry { + /* Size of a chapter page, in bytes */ + size_t bytes_per_page; + /* Number of record pages in a chapter */ + u32 record_pages_per_chapter; + /* Total number of chapters in a volume */ + u32 chapters_per_volume; + /* Number of sparsely-indexed chapters in a volume */ + u32 sparse_chapters_per_volume; + /* Number of bits used to determine delta list numbers */ + u8 chapter_delta_list_bits; + /* Virtual chapter remapped from physical chapter 0 */ + u64 remapped_virtual; + /* New physical chapter where the remapped chapter can be found */ + u64 remapped_physical; + + /* + * The following properties are derived from the ones above, but they are computed and + * recorded as fields for convenience. + */ + /* Total number of pages in a volume, excluding the header */ + u32 pages_per_volume; + /* Total number of bytes in a volume, including the header */ + size_t bytes_per_volume; + /* Number of pages in a chapter */ + u32 pages_per_chapter; + /* Number of index pages in a chapter index */ + u32 index_pages_per_chapter; + /* Number of records that fit on a page */ + u32 records_per_page; + /* Number of records that fit in a chapter */ + u32 records_per_chapter; + /* Number of records that fit in a volume */ + u64 records_per_volume; + /* Number of delta lists per chapter index */ + u32 delta_lists_per_chapter; + /* Mean delta for chapter indexes */ + u32 chapter_mean_delta; + /* Number of bits needed for record page numbers */ + u8 chapter_payload_bits; + /* Number of bits used to compute addresses for chapter delta lists */ + u8 chapter_address_bits; + /* Number of densely-indexed chapters in a volume */ + u32 dense_chapters_per_volume; +}; + +enum { + /* The number of bytes in a record (name + metadata) */ + BYTES_PER_RECORD = (UDS_RECORD_NAME_SIZE + UDS_RECORD_DATA_SIZE), + + /* The default length of a page in a chapter, in bytes */ + DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, + + /* The default maximum number of records per page */ + DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, + + /* The default number of record pages in a chapter */ + DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, + + /* The default number of record pages in a chapter for a small index */ + SMALL_RECORD_PAGES_PER_CHAPTER = 64, + + /* The default number of chapters in a volume */ + DEFAULT_CHAPTERS_PER_VOLUME = 1024, + + /* The default number of sparsely-indexed chapters in a volume */ + DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, + + /* The log2 of the default mean delta */ + DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, + + /* The log2 of the number of delta lists in a large chapter */ + DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, + + /* The log2 of the number of delta lists in a small chapter */ + SMALL_CHAPTER_DELTA_LIST_BITS = 10, + + /* The number of header pages per volume */ + HEADER_PAGES_PER_VOLUME = 1, +}; + +int __must_check uds_make_index_geometry(size_t bytes_per_page, u32 record_pages_per_chapter, + u32 chapters_per_volume, + u32 sparse_chapters_per_volume, u64 remapped_virtual, + u64 remapped_physical, + struct index_geometry **geometry_ptr); + +int __must_check uds_copy_index_geometry(struct index_geometry *source, + struct index_geometry **geometry_ptr); + +void uds_free_index_geometry(struct index_geometry *geometry); + +u32 __must_check uds_map_to_physical_chapter(const struct index_geometry *geometry, + u64 virtual_chapter); + +/* + * Check whether this geometry is reduced by a chapter. This will only be true if the volume was + * converted from a non-lvm volume to an lvm volume. + */ +static inline bool __must_check +uds_is_reduced_index_geometry(const struct index_geometry *geometry) +{ + return !!(geometry->chapters_per_volume & 1); +} + +static inline bool __must_check +uds_is_sparse_index_geometry(const struct index_geometry *geometry) +{ + return geometry->sparse_chapters_per_volume > 0; +} + +bool __must_check uds_has_sparse_chapters(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, + u64 newest_virtual_chapter); + +bool __must_check uds_is_chapter_sparse(const struct index_geometry *geometry, + u64 oldest_virtual_chapter, + u64 newest_virtual_chapter, + u64 virtual_chapter_number); + +u32 __must_check uds_chapters_to_expire(const struct index_geometry *geometry, + u64 newest_chapter); + +#endif /* UDS_INDEX_GEOMETRY_H */ diff --git a/drivers/md/dm-vdo/indexer/hash-utils.h b/drivers/md/dm-vdo/indexer/hash-utils.h new file mode 100644 index 0000000000..6a8dd8ffea --- /dev/null +++ b/drivers/md/dm-vdo/indexer/hash-utils.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_HASH_UTILS_H +#define UDS_HASH_UTILS_H + +#include "numeric.h" + +#include "geometry.h" +#include "indexer.h" + +/* Utilities for extracting portions of a request name for various uses. */ + +/* How various portions of a record name are apportioned. */ +enum { + VOLUME_INDEX_BYTES_OFFSET = 0, + VOLUME_INDEX_BYTES_COUNT = 8, + CHAPTER_INDEX_BYTES_OFFSET = 8, + CHAPTER_INDEX_BYTES_COUNT = 6, + SAMPLE_BYTES_OFFSET = 14, + SAMPLE_BYTES_COUNT = 2, +}; + +static inline u64 uds_extract_chapter_index_bytes(const struct uds_record_name *name) +{ + const u8 *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET]; + u64 bytes = (u64) get_unaligned_be16(chapter_bits) << 32; + + bytes |= get_unaligned_be32(chapter_bits + 2); + return bytes; +} + +static inline u64 uds_extract_volume_index_bytes(const struct uds_record_name *name) +{ + return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]); +} + +static inline u32 uds_extract_sampling_bytes(const struct uds_record_name *name) +{ + return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]); +} + +/* Compute the chapter delta list for a given name. */ +static inline u32 uds_hash_to_chapter_delta_list(const struct uds_record_name *name, + const struct index_geometry *geometry) +{ + return ((uds_extract_chapter_index_bytes(name) >> geometry->chapter_address_bits) & + ((1 << geometry->chapter_delta_list_bits) - 1)); +} + +/* Compute the chapter delta address for a given name. */ +static inline u32 uds_hash_to_chapter_delta_address(const struct uds_record_name *name, + const struct index_geometry *geometry) +{ + return uds_extract_chapter_index_bytes(name) & ((1 << geometry->chapter_address_bits) - 1); +} + +static inline unsigned int uds_name_to_hash_slot(const struct uds_record_name *name, + unsigned int slot_count) +{ + return (unsigned int) (uds_extract_chapter_index_bytes(name) % slot_count); +} + +#endif /* UDS_HASH_UTILS_H */ diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c new file mode 100644 index 0000000000..627adc24af --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -0,0 +1,1765 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "index-layout.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "murmurhash3.h" +#include "numeric.h" +#include "time-utils.h" + +#include "config.h" +#include "open-chapter.h" +#include "volume-index.h" + +/* + * The UDS layout on storage media is divided into a number of fixed-size regions, the sizes of + * which are computed when the index is created. Every header and region begins on 4K block + * boundary. Save regions are further sub-divided into regions of their own. + * + * Each region has a kind and an instance number. Some kinds only have one instance and therefore + * use RL_SOLE_INSTANCE (-1) as the instance number. The RL_KIND_INDEX used to use instances to + * represent sub-indices; now, however there is only ever one sub-index and therefore one instance. + * The RL_KIND_VOLUME_INDEX uses instances to record which zone is being saved. + * + * Every region header has a type and version. + * + * +-+-+---------+--------+--------+-+ + * | | | I N D E X 0 101, 0 | | + * |H|C+---------+--------+--------+S| + * |D|f| Volume | Save | Save |e| + * |R|g| Region | Region | Region |a| + * | | | 201, -1 | 202, 0 | 202, 1 |l| + * +-+-+--------+---------+--------+-+ + * + * The header contains the encoded region layout table as well as some index configuration data. + * The sub-index region and its subdivisions are maintained in the same table. + * + * There are two save regions to preserve the old state in case saving the new state is incomplete. + * They are used in alternation. Each save region is further divided into sub-regions. + * + * +-+-----+------+------+-----+-----+ + * |H| IPM | MI | MI | | OC | + * |D| | zone | zone | ... | | + * |R| 301 | 302 | 302 | | 303 | + * | | -1 | 0 | 1 | | -1 | + * +-+-----+------+------+-----+-----+ + * + * The header contains the encoded region layout table as well as index state data for that save. + * Each save also has a unique nonce. + */ + +#define MAGIC_SIZE 32 +#define NONCE_INFO_SIZE 32 +#define MAX_SAVES 2 + +enum region_kind { + RL_KIND_EMPTY = 0, + RL_KIND_HEADER = 1, + RL_KIND_CONFIG = 100, + RL_KIND_INDEX = 101, + RL_KIND_SEAL = 102, + RL_KIND_VOLUME = 201, + RL_KIND_SAVE = 202, + RL_KIND_INDEX_PAGE_MAP = 301, + RL_KIND_VOLUME_INDEX = 302, + RL_KIND_OPEN_CHAPTER = 303, +}; + +/* Some region types are historical and are no longer used. */ +enum region_type { + RH_TYPE_FREE = 0, /* unused */ + RH_TYPE_SUPER = 1, + RH_TYPE_SAVE = 2, + RH_TYPE_CHECKPOINT = 3, /* unused */ + RH_TYPE_UNSAVED = 4, +}; + +#define RL_SOLE_INSTANCE 65535 + +/* + * Super block version 2 is the first released version. + * + * Super block version 3 is the normal version used from RHEL 8.2 onwards. + * + * Super block versions 4 through 6 were incremental development versions and + * are not supported. + * + * Super block version 7 is used for volumes which have been reduced in size by one chapter in + * order to make room to prepend LVM metadata to a volume originally created without lvm. This + * allows the index to retain most its deduplication records. + */ +#define SUPER_VERSION_MINIMUM 3 +#define SUPER_VERSION_CURRENT 3 +#define SUPER_VERSION_MAXIMUM 7 + +static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ + +struct region_header { + u64 magic; + u64 region_blocks; + u16 type; + /* Currently always version 1 */ + u16 version; + u16 region_count; + u16 payload; +}; + +struct layout_region { + u64 start_block; + u64 block_count; + u32 __unused; + u16 kind; + u16 instance; +}; + +struct region_table { + size_t encoded_size; + struct region_header header; + struct layout_region regions[]; +}; + +struct index_save_data { + u64 timestamp; + u64 nonce; + /* Currently always version 1 */ + u32 version; + u32 unused__; +}; + +struct index_state_version { + s32 signature; + s32 version_id; +}; + +static const struct index_state_version INDEX_STATE_VERSION_301 = { + .signature = -1, + .version_id = 301, +}; + +struct index_state_data301 { + struct index_state_version version; + u64 newest_chapter; + u64 oldest_chapter; + u64 last_save; + u32 unused; + u32 padding; +}; + +struct index_save_layout { + unsigned int zone_count; + struct layout_region index_save; + struct layout_region header; + struct layout_region index_page_map; + struct layout_region free_space; + struct layout_region volume_index_zones[MAX_ZONES]; + struct layout_region open_chapter; + struct index_save_data save_data; + struct index_state_data301 state_data; +}; + +struct sub_index_layout { + u64 nonce; + struct layout_region sub_index; + struct layout_region volume; + struct index_save_layout *saves; +}; + +struct super_block_data { + u8 magic_label[MAGIC_SIZE]; + u8 nonce_info[NONCE_INFO_SIZE]; + u64 nonce; + u32 version; + u32 block_size; + u16 index_count; + u16 max_saves; + /* Padding reflects a blank field on permanent storage */ + u8 padding[4]; + u64 open_chapter_blocks; + u64 page_map_blocks; + u64 volume_offset; + u64 start_offset; +}; + +struct index_layout { + struct io_factory *factory; + size_t factory_size; + off_t offset; + struct super_block_data super; + struct layout_region header; + struct layout_region config; + struct sub_index_layout index; + struct layout_region seal; + u64 total_blocks; +}; + +struct save_layout_sizes { + unsigned int save_count; + size_t block_size; + u64 volume_blocks; + u64 volume_index_blocks; + u64 page_map_blocks; + u64 open_chapter_blocks; + u64 save_blocks; + u64 sub_index_blocks; + u64 total_blocks; + size_t total_size; +}; + +static inline bool is_converted_super_block(struct super_block_data *super) +{ + return super->version == 7; +} + +static int __must_check compute_sizes(const struct uds_configuration *config, + struct save_layout_sizes *sls) +{ + int result; + struct index_geometry *geometry = config->geometry; + + memset(sls, 0, sizeof(*sls)); + sls->save_count = MAX_SAVES; + sls->block_size = UDS_BLOCK_SIZE; + sls->volume_blocks = geometry->bytes_per_volume / sls->block_size; + + result = uds_compute_volume_index_save_blocks(config, sls->block_size, + &sls->volume_index_blocks); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "cannot compute index save size"); + + sls->page_map_blocks = + DIV_ROUND_UP(uds_compute_index_page_map_save_size(geometry), + sls->block_size); + sls->open_chapter_blocks = + DIV_ROUND_UP(uds_compute_saved_open_chapter_size(geometry), + sls->block_size); + sls->save_blocks = + 1 + (sls->volume_index_blocks + sls->page_map_blocks + sls->open_chapter_blocks); + sls->sub_index_blocks = sls->volume_blocks + (sls->save_count * sls->save_blocks); + sls->total_blocks = 3 + sls->sub_index_blocks; + sls->total_size = sls->total_blocks * sls->block_size; + + return UDS_SUCCESS; +} + +int uds_compute_index_size(const struct uds_parameters *parameters, u64 *index_size) +{ + int result; + struct uds_configuration *index_config; + struct save_layout_sizes sizes; + + if (index_size == NULL) { + vdo_log_error("Missing output size pointer"); + return -EINVAL; + } + + result = uds_make_configuration(parameters, &index_config); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, "cannot compute index size"); + return uds_status_to_errno(result); + } + + result = compute_sizes(index_config, &sizes); + uds_free_configuration(index_config); + if (result != UDS_SUCCESS) + return uds_status_to_errno(result); + + *index_size = sizes.total_size; + return UDS_SUCCESS; +} + +/* Create unique data using the current time and a pseudorandom number. */ +static void create_unique_nonce_data(u8 *buffer) +{ + ktime_t now = current_time_ns(CLOCK_REALTIME); + u32 rand; + size_t offset = 0; + + get_random_bytes(&rand, sizeof(u32)); + memcpy(buffer + offset, &now, sizeof(now)); + offset += sizeof(now); + memcpy(buffer + offset, &rand, sizeof(rand)); + offset += sizeof(rand); + while (offset < NONCE_INFO_SIZE) { + size_t len = min(NONCE_INFO_SIZE - offset, offset); + + memcpy(buffer + offset, buffer, len); + offset += len; + } +} + +static u64 hash_stuff(u64 start, const void *data, size_t len) +{ + u32 seed = start ^ (start >> 27); + u8 hash_buffer[16]; + + murmurhash3_128(data, len, seed, hash_buffer); + return get_unaligned_le64(hash_buffer + 4); +} + +/* Generate a primary nonce from the provided data. */ +static u64 generate_primary_nonce(const void *data, size_t len) +{ + return hash_stuff(0xa1b1e0fc, data, len); +} + +/* + * Deterministically generate a secondary nonce from an existing nonce and some arbitrary data by + * hashing the original nonce and the data to produce a new nonce. + */ +static u64 generate_secondary_nonce(u64 nonce, const void *data, size_t len) +{ + return hash_stuff(nonce + 1, data, len); +} + +static int __must_check open_layout_reader(struct index_layout *layout, + struct layout_region *lr, off_t offset, + struct buffered_reader **reader_ptr) +{ + return uds_make_buffered_reader(layout->factory, lr->start_block + offset, + lr->block_count, reader_ptr); +} + +static int open_region_reader(struct index_layout *layout, struct layout_region *region, + struct buffered_reader **reader_ptr) +{ + return open_layout_reader(layout, region, -layout->super.start_offset, + reader_ptr); +} + +static int __must_check open_layout_writer(struct index_layout *layout, + struct layout_region *lr, off_t offset, + struct buffered_writer **writer_ptr) +{ + return uds_make_buffered_writer(layout->factory, lr->start_block + offset, + lr->block_count, writer_ptr); +} + +static int open_region_writer(struct index_layout *layout, struct layout_region *region, + struct buffered_writer **writer_ptr) +{ + return open_layout_writer(layout, region, -layout->super.start_offset, + writer_ptr); +} + +static void generate_super_block_data(struct save_layout_sizes *sls, + struct super_block_data *super) +{ + memset(super, 0, sizeof(*super)); + memcpy(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE); + create_unique_nonce_data(super->nonce_info); + + super->nonce = generate_primary_nonce(super->nonce_info, + sizeof(super->nonce_info)); + super->version = SUPER_VERSION_CURRENT; + super->block_size = sls->block_size; + super->index_count = 1; + super->max_saves = sls->save_count; + super->open_chapter_blocks = sls->open_chapter_blocks; + super->page_map_blocks = sls->page_map_blocks; + super->volume_offset = 0; + super->start_offset = 0; +} + +static void define_sub_index_nonce(struct index_layout *layout) +{ + struct sub_index_nonce_data { + u64 offset; + u16 index_id; + }; + struct sub_index_layout *sil = &layout->index; + u64 primary_nonce = layout->super.nonce; + u8 buffer[sizeof(struct sub_index_nonce_data)] = { 0 }; + size_t offset = 0; + + encode_u64_le(buffer, &offset, sil->sub_index.start_block); + encode_u16_le(buffer, &offset, 0); + sil->nonce = generate_secondary_nonce(primary_nonce, buffer, sizeof(buffer)); + if (sil->nonce == 0) { + sil->nonce = generate_secondary_nonce(~primary_nonce + 1, buffer, + sizeof(buffer)); + } +} + +static void setup_sub_index(struct index_layout *layout, u64 start_block, + struct save_layout_sizes *sls) +{ + struct sub_index_layout *sil = &layout->index; + u64 next_block = start_block; + unsigned int i; + + sil->sub_index = (struct layout_region) { + .start_block = start_block, + .block_count = sls->sub_index_blocks, + .kind = RL_KIND_INDEX, + .instance = 0, + }; + + sil->volume = (struct layout_region) { + .start_block = next_block, + .block_count = sls->volume_blocks, + .kind = RL_KIND_VOLUME, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += sls->volume_blocks; + + for (i = 0; i < sls->save_count; i++) { + sil->saves[i].index_save = (struct layout_region) { + .start_block = next_block, + .block_count = sls->save_blocks, + .kind = RL_KIND_SAVE, + .instance = i, + }; + + next_block += sls->save_blocks; + } + + define_sub_index_nonce(layout); +} + +static void initialize_layout(struct index_layout *layout, struct save_layout_sizes *sls) +{ + u64 next_block = layout->offset / sls->block_size; + + layout->total_blocks = sls->total_blocks; + generate_super_block_data(sls, &layout->super); + layout->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + layout->config = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_CONFIG, + .instance = RL_SOLE_INSTANCE, + }; + + setup_sub_index(layout, next_block, sls); + next_block += sls->sub_index_blocks; + + layout->seal = (struct layout_region) { + .start_block = next_block, + .block_count = 1, + .kind = RL_KIND_SEAL, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int __must_check make_index_save_region_table(struct index_save_layout *isl, + struct region_table **table_ptr) +{ + int result; + unsigned int z; + struct region_table *table; + struct layout_region *lr; + u16 region_count; + size_t payload; + size_t type; + + if (isl->zone_count > 0) { + /* + * Normal save regions: header, page map, volume index zones, + * open chapter, and possibly free space. + */ + region_count = 3 + isl->zone_count; + if (isl->free_space.block_count > 0) + region_count++; + + payload = sizeof(isl->save_data) + sizeof(isl->state_data); + type = RH_TYPE_SAVE; + } else { + /* Empty save regions: header, page map, free space. */ + region_count = 3; + payload = sizeof(isl->save_data); + type = RH_TYPE_UNSAVED; + } + + result = vdo_allocate_extended(struct region_table, region_count, + struct layout_region, + "layout region table for ISL", &table); + if (result != VDO_SUCCESS) + return result; + + lr = &table->regions[0]; + *lr++ = isl->header; + *lr++ = isl->index_page_map; + for (z = 0; z < isl->zone_count; z++) + *lr++ = isl->volume_index_zones[z]; + + if (isl->zone_count > 0) + *lr++ = isl->open_chapter; + + if (isl->free_space.block_count > 0) + *lr++ = isl->free_space; + + table->header = (struct region_header) { + .magic = REGION_MAGIC, + .region_blocks = isl->index_save.block_count, + .type = type, + .version = 1, + .region_count = region_count, + .payload = payload, + }; + + table->encoded_size = (sizeof(struct region_header) + payload + + region_count * sizeof(struct layout_region)); + *table_ptr = table; + return UDS_SUCCESS; +} + +static void encode_region_table(u8 *buffer, size_t *offset, struct region_table *table) +{ + unsigned int i; + + encode_u64_le(buffer, offset, REGION_MAGIC); + encode_u64_le(buffer, offset, table->header.region_blocks); + encode_u16_le(buffer, offset, table->header.type); + encode_u16_le(buffer, offset, table->header.version); + encode_u16_le(buffer, offset, table->header.region_count); + encode_u16_le(buffer, offset, table->header.payload); + + for (i = 0; i < table->header.region_count; i++) { + encode_u64_le(buffer, offset, table->regions[i].start_block); + encode_u64_le(buffer, offset, table->regions[i].block_count); + encode_u32_le(buffer, offset, 0); + encode_u16_le(buffer, offset, table->regions[i].kind); + encode_u16_le(buffer, offset, table->regions[i].instance); + } +} + +static int __must_check write_index_save_header(struct index_save_layout *isl, + struct region_table *table, + struct buffered_writer *writer) +{ + int result; + u8 *buffer; + size_t offset = 0; + + result = vdo_allocate(table->encoded_size, u8, "index save data", &buffer); + if (result != VDO_SUCCESS) + return result; + + encode_region_table(buffer, &offset, table); + encode_u64_le(buffer, &offset, isl->save_data.timestamp); + encode_u64_le(buffer, &offset, isl->save_data.nonce); + encode_u32_le(buffer, &offset, isl->save_data.version); + encode_u32_le(buffer, &offset, 0); + if (isl->zone_count > 0) { + encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.signature); + encode_u32_le(buffer, &offset, INDEX_STATE_VERSION_301.version_id); + encode_u64_le(buffer, &offset, isl->state_data.newest_chapter); + encode_u64_le(buffer, &offset, isl->state_data.oldest_chapter); + encode_u64_le(buffer, &offset, isl->state_data.last_save); + encode_u64_le(buffer, &offset, 0); + } + + result = uds_write_to_buffered_writer(writer, buffer, offset); + vdo_free(buffer); + if (result != UDS_SUCCESS) + return result; + + return uds_flush_buffered_writer(writer); +} + +static int write_index_save_layout(struct index_layout *layout, + struct index_save_layout *isl) +{ + int result; + struct region_table *table; + struct buffered_writer *writer; + + result = make_index_save_region_table(isl, &table); + if (result != UDS_SUCCESS) + return result; + + result = open_region_writer(layout, &isl->header, &writer); + if (result != UDS_SUCCESS) { + vdo_free(table); + return result; + } + + result = write_index_save_header(isl, table, writer); + vdo_free(table); + uds_free_buffered_writer(writer); + + return result; +} + +static void reset_index_save_layout(struct index_save_layout *isl, u64 page_map_blocks) +{ + u64 free_blocks; + u64 next_block = isl->index_save.start_block; + + isl->zone_count = 0; + memset(&isl->save_data, 0, sizeof(isl->save_data)); + + isl->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + isl->index_page_map = (struct layout_region) { + .start_block = next_block, + .block_count = page_map_blocks, + .kind = RL_KIND_INDEX_PAGE_MAP, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += page_map_blocks; + + free_blocks = isl->index_save.block_count - page_map_blocks - 1; + isl->free_space = (struct layout_region) { + .start_block = next_block, + .block_count = free_blocks, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int __must_check invalidate_old_save(struct index_layout *layout, + struct index_save_layout *isl) +{ + reset_index_save_layout(isl, layout->super.page_map_blocks); + return write_index_save_layout(layout, isl); +} + +static int discard_index_state_data(struct index_layout *layout) +{ + int result; + int saved_result = UDS_SUCCESS; + unsigned int i; + + for (i = 0; i < layout->super.max_saves; i++) { + result = invalidate_old_save(layout, &layout->index.saves[i]); + if (result != UDS_SUCCESS) + saved_result = result; + } + + if (saved_result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, + "%s: cannot destroy all index saves", + __func__); + } + + return UDS_SUCCESS; +} + +static int __must_check make_layout_region_table(struct index_layout *layout, + struct region_table **table_ptr) +{ + int result; + unsigned int i; + /* Regions: header, config, index, volume, saves, seal */ + u16 region_count = 5 + layout->super.max_saves; + u16 payload; + struct region_table *table; + struct layout_region *lr; + + result = vdo_allocate_extended(struct region_table, region_count, + struct layout_region, "layout region table", + &table); + if (result != VDO_SUCCESS) + return result; + + lr = &table->regions[0]; + *lr++ = layout->header; + *lr++ = layout->config; + *lr++ = layout->index.sub_index; + *lr++ = layout->index.volume; + + for (i = 0; i < layout->super.max_saves; i++) + *lr++ = layout->index.saves[i].index_save; + + *lr++ = layout->seal; + + if (is_converted_super_block(&layout->super)) { + payload = sizeof(struct super_block_data); + } else { + payload = (sizeof(struct super_block_data) - + sizeof(layout->super.volume_offset) - + sizeof(layout->super.start_offset)); + } + + table->header = (struct region_header) { + .magic = REGION_MAGIC, + .region_blocks = layout->total_blocks, + .type = RH_TYPE_SUPER, + .version = 1, + .region_count = region_count, + .payload = payload, + }; + + table->encoded_size = (sizeof(struct region_header) + payload + + region_count * sizeof(struct layout_region)); + *table_ptr = table; + return UDS_SUCCESS; +} + +static int __must_check write_layout_header(struct index_layout *layout, + struct region_table *table, + struct buffered_writer *writer) +{ + int result; + u8 *buffer; + size_t offset = 0; + + result = vdo_allocate(table->encoded_size, u8, "layout data", &buffer); + if (result != VDO_SUCCESS) + return result; + + encode_region_table(buffer, &offset, table); + memcpy(buffer + offset, &layout->super.magic_label, MAGIC_SIZE); + offset += MAGIC_SIZE; + memcpy(buffer + offset, &layout->super.nonce_info, NONCE_INFO_SIZE); + offset += NONCE_INFO_SIZE; + encode_u64_le(buffer, &offset, layout->super.nonce); + encode_u32_le(buffer, &offset, layout->super.version); + encode_u32_le(buffer, &offset, layout->super.block_size); + encode_u16_le(buffer, &offset, layout->super.index_count); + encode_u16_le(buffer, &offset, layout->super.max_saves); + encode_u32_le(buffer, &offset, 0); + encode_u64_le(buffer, &offset, layout->super.open_chapter_blocks); + encode_u64_le(buffer, &offset, layout->super.page_map_blocks); + + if (is_converted_super_block(&layout->super)) { + encode_u64_le(buffer, &offset, layout->super.volume_offset); + encode_u64_le(buffer, &offset, layout->super.start_offset); + } + + result = uds_write_to_buffered_writer(writer, buffer, offset); + vdo_free(buffer); + if (result != UDS_SUCCESS) + return result; + + return uds_flush_buffered_writer(writer); +} + +static int __must_check write_uds_index_config(struct index_layout *layout, + struct uds_configuration *config, + off_t offset) +{ + int result; + struct buffered_writer *writer = NULL; + + result = open_layout_writer(layout, &layout->config, offset, &writer); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "failed to open config region"); + + result = uds_write_config_contents(writer, config, layout->super.version); + if (result != UDS_SUCCESS) { + uds_free_buffered_writer(writer); + return vdo_log_error_strerror(result, "failed to write config region"); + } + + result = uds_flush_buffered_writer(writer); + if (result != UDS_SUCCESS) { + uds_free_buffered_writer(writer); + return vdo_log_error_strerror(result, "cannot flush config writer"); + } + + uds_free_buffered_writer(writer); + return UDS_SUCCESS; +} + +static int __must_check save_layout(struct index_layout *layout, off_t offset) +{ + int result; + struct buffered_writer *writer = NULL; + struct region_table *table; + + result = make_layout_region_table(layout, &table); + if (result != UDS_SUCCESS) + return result; + + result = open_layout_writer(layout, &layout->header, offset, &writer); + if (result != UDS_SUCCESS) { + vdo_free(table); + return result; + } + + result = write_layout_header(layout, table, writer); + vdo_free(table); + uds_free_buffered_writer(writer); + + return result; +} + +static int create_index_layout(struct index_layout *layout, struct uds_configuration *config) +{ + int result; + struct save_layout_sizes sizes; + + result = compute_sizes(config, &sizes); + if (result != UDS_SUCCESS) + return result; + + result = vdo_allocate(sizes.save_count, struct index_save_layout, __func__, + &layout->index.saves); + if (result != VDO_SUCCESS) + return result; + + initialize_layout(layout, &sizes); + + result = discard_index_state_data(layout); + if (result != UDS_SUCCESS) + return result; + + result = write_uds_index_config(layout, config, 0); + if (result != UDS_SUCCESS) + return result; + + return save_layout(layout, 0); +} + +static u64 generate_index_save_nonce(u64 volume_nonce, struct index_save_layout *isl) +{ + struct save_nonce_data { + struct index_save_data data; + u64 offset; + } nonce_data; + u8 buffer[sizeof(nonce_data)]; + size_t offset = 0; + + encode_u64_le(buffer, &offset, isl->save_data.timestamp); + encode_u64_le(buffer, &offset, 0); + encode_u32_le(buffer, &offset, isl->save_data.version); + encode_u32_le(buffer, &offset, 0U); + encode_u64_le(buffer, &offset, isl->index_save.start_block); + VDO_ASSERT_LOG_ONLY(offset == sizeof(nonce_data), + "%zu bytes encoded of %zu expected", + offset, sizeof(nonce_data)); + return generate_secondary_nonce(volume_nonce, buffer, sizeof(buffer)); +} + +static u64 validate_index_save_layout(struct index_save_layout *isl, u64 volume_nonce) +{ + if ((isl->zone_count == 0) || (isl->save_data.timestamp == 0)) + return 0; + + if (isl->save_data.nonce != generate_index_save_nonce(volume_nonce, isl)) + return 0; + + return isl->save_data.timestamp; +} + +static int find_latest_uds_index_save_slot(struct index_layout *layout, + struct index_save_layout **isl_ptr) +{ + struct index_save_layout *latest = NULL; + struct index_save_layout *isl; + unsigned int i; + u64 save_time = 0; + u64 latest_time = 0; + + for (i = 0; i < layout->super.max_saves; i++) { + isl = &layout->index.saves[i]; + save_time = validate_index_save_layout(isl, layout->index.nonce); + if (save_time > latest_time) { + latest = isl; + latest_time = save_time; + } + } + + if (latest == NULL) { + vdo_log_error("No valid index save found"); + return UDS_INDEX_NOT_SAVED_CLEANLY; + } + + *isl_ptr = latest; + return UDS_SUCCESS; +} + +int uds_discard_open_chapter(struct index_layout *layout) +{ + int result; + struct index_save_layout *isl; + struct buffered_writer *writer; + + result = find_latest_uds_index_save_slot(layout, &isl); + if (result != UDS_SUCCESS) + return result; + + result = open_region_writer(layout, &isl->open_chapter, &writer); + if (result != UDS_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(writer, NULL, UDS_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + uds_free_buffered_writer(writer); + return result; + } + + result = uds_flush_buffered_writer(writer); + uds_free_buffered_writer(writer); + return result; +} + +int uds_load_index_state(struct index_layout *layout, struct uds_index *index) +{ + int result; + unsigned int zone; + struct index_save_layout *isl; + struct buffered_reader *readers[MAX_ZONES]; + + result = find_latest_uds_index_save_slot(layout, &isl); + if (result != UDS_SUCCESS) + return result; + + index->newest_virtual_chapter = isl->state_data.newest_chapter; + index->oldest_virtual_chapter = isl->state_data.oldest_chapter; + index->last_save = isl->state_data.last_save; + + result = open_region_reader(layout, &isl->open_chapter, &readers[0]); + if (result != UDS_SUCCESS) + return result; + + result = uds_load_open_chapter(index, readers[0]); + uds_free_buffered_reader(readers[0]); + if (result != UDS_SUCCESS) + return result; + + for (zone = 0; zone < isl->zone_count; zone++) { + result = open_region_reader(layout, &isl->volume_index_zones[zone], + &readers[zone]); + if (result != UDS_SUCCESS) { + for (; zone > 0; zone--) + uds_free_buffered_reader(readers[zone - 1]); + + return result; + } + } + + result = uds_load_volume_index(index->volume_index, readers, isl->zone_count); + for (zone = 0; zone < isl->zone_count; zone++) + uds_free_buffered_reader(readers[zone]); + if (result != UDS_SUCCESS) + return result; + + result = open_region_reader(layout, &isl->index_page_map, &readers[0]); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_index_page_map(index->volume->index_page_map, readers[0]); + uds_free_buffered_reader(readers[0]); + + return result; +} + +static struct index_save_layout *select_oldest_index_save_layout(struct index_layout *layout) +{ + struct index_save_layout *oldest = NULL; + struct index_save_layout *isl; + unsigned int i; + u64 save_time = 0; + u64 oldest_time = 0; + + for (i = 0; i < layout->super.max_saves; i++) { + isl = &layout->index.saves[i]; + save_time = validate_index_save_layout(isl, layout->index.nonce); + if (oldest == NULL || save_time < oldest_time) { + oldest = isl; + oldest_time = save_time; + } + } + + return oldest; +} + +static void instantiate_index_save_layout(struct index_save_layout *isl, + struct super_block_data *super, + u64 volume_nonce, unsigned int zone_count) +{ + unsigned int z; + u64 next_block; + u64 free_blocks; + u64 volume_index_blocks; + + isl->zone_count = zone_count; + memset(&isl->save_data, 0, sizeof(isl->save_data)); + isl->save_data.timestamp = ktime_to_ms(current_time_ns(CLOCK_REALTIME)); + isl->save_data.version = 1; + isl->save_data.nonce = generate_index_save_nonce(volume_nonce, isl); + + next_block = isl->index_save.start_block; + isl->header = (struct layout_region) { + .start_block = next_block++, + .block_count = 1, + .kind = RL_KIND_HEADER, + .instance = RL_SOLE_INSTANCE, + }; + + isl->index_page_map = (struct layout_region) { + .start_block = next_block, + .block_count = super->page_map_blocks, + .kind = RL_KIND_INDEX_PAGE_MAP, + .instance = RL_SOLE_INSTANCE, + }; + next_block += super->page_map_blocks; + + free_blocks = (isl->index_save.block_count - 1 - + super->page_map_blocks - + super->open_chapter_blocks); + volume_index_blocks = free_blocks / isl->zone_count; + for (z = 0; z < isl->zone_count; z++) { + isl->volume_index_zones[z] = (struct layout_region) { + .start_block = next_block, + .block_count = volume_index_blocks, + .kind = RL_KIND_VOLUME_INDEX, + .instance = z, + }; + + next_block += volume_index_blocks; + free_blocks -= volume_index_blocks; + } + + isl->open_chapter = (struct layout_region) { + .start_block = next_block, + .block_count = super->open_chapter_blocks, + .kind = RL_KIND_OPEN_CHAPTER, + .instance = RL_SOLE_INSTANCE, + }; + + next_block += super->open_chapter_blocks; + + isl->free_space = (struct layout_region) { + .start_block = next_block, + .block_count = free_blocks, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; +} + +static int setup_uds_index_save_slot(struct index_layout *layout, + unsigned int zone_count, + struct index_save_layout **isl_ptr) +{ + int result; + struct index_save_layout *isl; + + isl = select_oldest_index_save_layout(layout); + result = invalidate_old_save(layout, isl); + if (result != UDS_SUCCESS) + return result; + + instantiate_index_save_layout(isl, &layout->super, layout->index.nonce, + zone_count); + + *isl_ptr = isl; + return UDS_SUCCESS; +} + +static void cancel_uds_index_save(struct index_save_layout *isl) +{ + memset(&isl->save_data, 0, sizeof(isl->save_data)); + memset(&isl->state_data, 0, sizeof(isl->state_data)); + isl->zone_count = 0; +} + +int uds_save_index_state(struct index_layout *layout, struct uds_index *index) +{ + int result; + unsigned int zone; + struct index_save_layout *isl; + struct buffered_writer *writers[MAX_ZONES]; + + result = setup_uds_index_save_slot(layout, index->zone_count, &isl); + if (result != UDS_SUCCESS) + return result; + + isl->state_data = (struct index_state_data301) { + .newest_chapter = index->newest_virtual_chapter, + .oldest_chapter = index->oldest_virtual_chapter, + .last_save = index->last_save, + }; + + result = open_region_writer(layout, &isl->open_chapter, &writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = uds_save_open_chapter(index, writers[0]); + uds_free_buffered_writer(writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + for (zone = 0; zone < index->zone_count; zone++) { + result = open_region_writer(layout, &isl->volume_index_zones[zone], + &writers[zone]); + if (result != UDS_SUCCESS) { + for (; zone > 0; zone--) + uds_free_buffered_writer(writers[zone - 1]); + + cancel_uds_index_save(isl); + return result; + } + } + + result = uds_save_volume_index(index->volume_index, writers, index->zone_count); + for (zone = 0; zone < index->zone_count; zone++) + uds_free_buffered_writer(writers[zone]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = open_region_writer(layout, &isl->index_page_map, &writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + result = uds_write_index_page_map(index->volume->index_page_map, writers[0]); + uds_free_buffered_writer(writers[0]); + if (result != UDS_SUCCESS) { + cancel_uds_index_save(isl); + return result; + } + + return write_index_save_layout(layout, isl); +} + +static int __must_check load_region_table(struct buffered_reader *reader, + struct region_table **table_ptr) +{ + int result; + unsigned int i; + struct region_header header; + struct region_table *table; + u8 buffer[sizeof(struct region_header)]; + size_t offset = 0; + + result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "cannot read region table header"); + + decode_u64_le(buffer, &offset, &header.magic); + decode_u64_le(buffer, &offset, &header.region_blocks); + decode_u16_le(buffer, &offset, &header.type); + decode_u16_le(buffer, &offset, &header.version); + decode_u16_le(buffer, &offset, &header.region_count); + decode_u16_le(buffer, &offset, &header.payload); + + if (header.magic != REGION_MAGIC) + return UDS_NO_INDEX; + + if (header.version != 1) { + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown region table version %hu", + header.version); + } + + result = vdo_allocate_extended(struct region_table, header.region_count, + struct layout_region, + "single file layout region table", &table); + if (result != VDO_SUCCESS) + return result; + + table->header = header; + for (i = 0; i < header.region_count; i++) { + u8 region_buffer[sizeof(struct layout_region)]; + + offset = 0; + result = uds_read_from_buffered_reader(reader, region_buffer, + sizeof(region_buffer)); + if (result != UDS_SUCCESS) { + vdo_free(table); + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "cannot read region table layouts"); + } + + decode_u64_le(region_buffer, &offset, &table->regions[i].start_block); + decode_u64_le(region_buffer, &offset, &table->regions[i].block_count); + offset += sizeof(u32); + decode_u16_le(region_buffer, &offset, &table->regions[i].kind); + decode_u16_le(region_buffer, &offset, &table->regions[i].instance); + } + + *table_ptr = table; + return UDS_SUCCESS; +} + +static int __must_check read_super_block_data(struct buffered_reader *reader, + struct index_layout *layout, + size_t saved_size) +{ + int result; + struct super_block_data *super = &layout->super; + u8 *buffer; + size_t offset = 0; + + result = vdo_allocate(saved_size, u8, "super block data", &buffer); + if (result != VDO_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, buffer, saved_size); + if (result != UDS_SUCCESS) { + vdo_free(buffer); + return vdo_log_error_strerror(result, "cannot read region table header"); + } + + memcpy(&super->magic_label, buffer, MAGIC_SIZE); + offset += MAGIC_SIZE; + memcpy(&super->nonce_info, buffer + offset, NONCE_INFO_SIZE); + offset += NONCE_INFO_SIZE; + decode_u64_le(buffer, &offset, &super->nonce); + decode_u32_le(buffer, &offset, &super->version); + decode_u32_le(buffer, &offset, &super->block_size); + decode_u16_le(buffer, &offset, &super->index_count); + decode_u16_le(buffer, &offset, &super->max_saves); + offset += sizeof(u32); + decode_u64_le(buffer, &offset, &super->open_chapter_blocks); + decode_u64_le(buffer, &offset, &super->page_map_blocks); + + if (is_converted_super_block(super)) { + decode_u64_le(buffer, &offset, &super->volume_offset); + decode_u64_le(buffer, &offset, &super->start_offset); + } else { + super->volume_offset = 0; + super->start_offset = 0; + } + + vdo_free(buffer); + + if (memcmp(super->magic_label, LAYOUT_MAGIC, MAGIC_SIZE) != 0) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "unknown superblock magic label"); + + if ((super->version < SUPER_VERSION_MINIMUM) || + (super->version == 4) || (super->version == 5) || (super->version == 6) || + (super->version > SUPER_VERSION_MAXIMUM)) { + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown superblock version number %u", + super->version); + } + + if (super->volume_offset < super->start_offset) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "inconsistent offsets (start %llu, volume %llu)", + (unsigned long long) super->start_offset, + (unsigned long long) super->volume_offset); + } + + /* Sub-indexes are no longer used but the layout retains this field. */ + if (super->index_count != 1) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "invalid subindex count %u", + super->index_count); + } + + if (generate_primary_nonce(super->nonce_info, sizeof(super->nonce_info)) != super->nonce) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "inconsistent superblock nonce"); + } + + return UDS_SUCCESS; +} + +static int __must_check verify_region(struct layout_region *lr, u64 start_block, + enum region_kind kind, unsigned int instance) +{ + if (lr->start_block != start_block) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region offset"); + + if (lr->kind != kind) + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region kind"); + + if (lr->instance != instance) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "incorrect layout region instance"); + } + + return UDS_SUCCESS; +} + +static int __must_check verify_sub_index(struct index_layout *layout, u64 start_block, + struct region_table *table) +{ + int result; + unsigned int i; + struct sub_index_layout *sil = &layout->index; + u64 next_block = start_block; + + sil->sub_index = table->regions[2]; + result = verify_region(&sil->sub_index, next_block, RL_KIND_INDEX, 0); + if (result != UDS_SUCCESS) + return result; + + define_sub_index_nonce(layout); + + sil->volume = table->regions[3]; + result = verify_region(&sil->volume, next_block, RL_KIND_VOLUME, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += sil->volume.block_count + layout->super.volume_offset; + + for (i = 0; i < layout->super.max_saves; i++) { + sil->saves[i].index_save = table->regions[i + 4]; + result = verify_region(&sil->saves[i].index_save, next_block, + RL_KIND_SAVE, i); + if (result != UDS_SUCCESS) + return result; + + next_block += sil->saves[i].index_save.block_count; + } + + next_block -= layout->super.volume_offset; + if (next_block != start_block + sil->sub_index.block_count) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "sub index region does not span all saves"); + } + + return UDS_SUCCESS; +} + +static int __must_check reconstitute_layout(struct index_layout *layout, + struct region_table *table, u64 first_block) +{ + int result; + u64 next_block = first_block; + + result = vdo_allocate(layout->super.max_saves, struct index_save_layout, + __func__, &layout->index.saves); + if (result != VDO_SUCCESS) + return result; + + layout->total_blocks = table->header.region_blocks; + + layout->header = table->regions[0]; + result = verify_region(&layout->header, next_block++, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + layout->config = table->regions[1]; + result = verify_region(&layout->config, next_block++, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + result = verify_sub_index(layout, next_block, table); + if (result != UDS_SUCCESS) + return result; + + next_block += layout->index.sub_index.block_count; + + layout->seal = table->regions[table->header.region_count - 1]; + result = verify_region(&layout->seal, next_block + layout->super.volume_offset, + RL_KIND_SEAL, RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + if (++next_block != (first_block + layout->total_blocks)) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "layout table does not span total blocks"); + } + + return UDS_SUCCESS; +} + +static int __must_check load_super_block(struct index_layout *layout, size_t block_size, + u64 first_block, struct buffered_reader *reader) +{ + int result; + struct region_table *table = NULL; + struct super_block_data *super = &layout->super; + + result = load_region_table(reader, &table); + if (result != UDS_SUCCESS) + return result; + + if (table->header.type != RH_TYPE_SUPER) { + vdo_free(table); + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "not a superblock region table"); + } + + result = read_super_block_data(reader, layout, table->header.payload); + if (result != UDS_SUCCESS) { + vdo_free(table); + return vdo_log_error_strerror(result, "unknown superblock format"); + } + + if (super->block_size != block_size) { + vdo_free(table); + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "superblock saved block_size %u differs from supplied block_size %zu", + super->block_size, block_size); + } + + first_block -= (super->volume_offset - super->start_offset); + result = reconstitute_layout(layout, table, first_block); + vdo_free(table); + return result; +} + +static int __must_check read_index_save_data(struct buffered_reader *reader, + struct index_save_layout *isl, + size_t saved_size) +{ + int result; + struct index_state_version file_version; + u8 buffer[sizeof(struct index_save_data) + sizeof(struct index_state_data301)]; + size_t offset = 0; + + if (saved_size != sizeof(buffer)) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save data size %zu", + saved_size); + } + + result = uds_read_from_buffered_reader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "cannot read index save data"); + + decode_u64_le(buffer, &offset, &isl->save_data.timestamp); + decode_u64_le(buffer, &offset, &isl->save_data.nonce); + decode_u32_le(buffer, &offset, &isl->save_data.version); + offset += sizeof(u32); + + if (isl->save_data.version > 1) { + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "unknown index save version number %u", + isl->save_data.version); + } + + decode_s32_le(buffer, &offset, &file_version.signature); + decode_s32_le(buffer, &offset, &file_version.version_id); + + if ((file_version.signature != INDEX_STATE_VERSION_301.signature) || + (file_version.version_id != INDEX_STATE_VERSION_301.version_id)) { + return vdo_log_error_strerror(UDS_UNSUPPORTED_VERSION, + "index state version %d,%d is unsupported", + file_version.signature, + file_version.version_id); + } + + decode_u64_le(buffer, &offset, &isl->state_data.newest_chapter); + decode_u64_le(buffer, &offset, &isl->state_data.oldest_chapter); + decode_u64_le(buffer, &offset, &isl->state_data.last_save); + /* Skip past some historical fields that are now unused */ + offset += sizeof(u32) + sizeof(u32); + return UDS_SUCCESS; +} + +static int __must_check reconstruct_index_save(struct index_save_layout *isl, + struct region_table *table) +{ + int result; + unsigned int z; + struct layout_region *last_region; + u64 next_block = isl->index_save.start_block; + u64 last_block = next_block + isl->index_save.block_count; + + isl->zone_count = table->header.region_count - 3; + + last_region = &table->regions[table->header.region_count - 1]; + if (last_region->kind == RL_KIND_EMPTY) { + isl->free_space = *last_region; + isl->zone_count--; + } else { + isl->free_space = (struct layout_region) { + .start_block = last_block, + .block_count = 0, + .kind = RL_KIND_EMPTY, + .instance = RL_SOLE_INSTANCE, + }; + } + + isl->header = table->regions[0]; + result = verify_region(&isl->header, next_block++, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + isl->index_page_map = table->regions[1]; + result = verify_region(&isl->index_page_map, next_block, RL_KIND_INDEX_PAGE_MAP, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->index_page_map.block_count; + + for (z = 0; z < isl->zone_count; z++) { + isl->volume_index_zones[z] = table->regions[z + 2]; + result = verify_region(&isl->volume_index_zones[z], next_block, + RL_KIND_VOLUME_INDEX, z); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->volume_index_zones[z].block_count; + } + + isl->open_chapter = table->regions[isl->zone_count + 2]; + result = verify_region(&isl->open_chapter, next_block, RL_KIND_OPEN_CHAPTER, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->open_chapter.block_count; + + result = verify_region(&isl->free_space, next_block, RL_KIND_EMPTY, + RL_SOLE_INSTANCE); + if (result != UDS_SUCCESS) + return result; + + next_block += isl->free_space.block_count; + if (next_block != last_block) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "index save layout table incomplete"); + } + + return UDS_SUCCESS; +} + +static int __must_check load_index_save(struct index_save_layout *isl, + struct buffered_reader *reader, + unsigned int instance) +{ + int result; + struct region_table *table = NULL; + + result = load_region_table(reader, &table); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, "cannot read index save %u header", + instance); + } + + if (table->header.region_blocks != isl->index_save.block_count) { + u64 region_blocks = table->header.region_blocks; + + vdo_free(table); + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save %u region block count %llu", + instance, + (unsigned long long) region_blocks); + } + + if (table->header.type == RH_TYPE_UNSAVED) { + vdo_free(table); + reset_index_save_layout(isl, 0); + return UDS_SUCCESS; + } + + + if (table->header.type != RH_TYPE_SAVE) { + vdo_log_error_strerror(UDS_CORRUPT_DATA, + "unexpected index save %u header type %u", + instance, table->header.type); + vdo_free(table); + return UDS_CORRUPT_DATA; + } + + result = read_index_save_data(reader, isl, table->header.payload); + if (result != UDS_SUCCESS) { + vdo_free(table); + return vdo_log_error_strerror(result, + "unknown index save %u data format", + instance); + } + + result = reconstruct_index_save(isl, table); + vdo_free(table); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, "cannot reconstruct index save %u", + instance); + } + + return UDS_SUCCESS; +} + +static int __must_check load_sub_index_regions(struct index_layout *layout) +{ + int result; + unsigned int j; + struct index_save_layout *isl; + struct buffered_reader *reader; + + for (j = 0; j < layout->super.max_saves; j++) { + isl = &layout->index.saves[j]; + result = open_region_reader(layout, &isl->index_save, &reader); + + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, + "cannot get reader for index 0 save %u", + j); + return result; + } + + result = load_index_save(isl, reader, j); + uds_free_buffered_reader(reader); + if (result != UDS_SUCCESS) { + /* Another save slot might be valid. */ + reset_index_save_layout(isl, 0); + continue; + } + } + + return UDS_SUCCESS; +} + +static int __must_check verify_uds_index_config(struct index_layout *layout, + struct uds_configuration *config) +{ + int result; + struct buffered_reader *reader = NULL; + u64 offset; + + offset = layout->super.volume_offset - layout->super.start_offset; + result = open_layout_reader(layout, &layout->config, offset, &reader); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "failed to open config reader"); + + result = uds_validate_config_contents(reader, config); + if (result != UDS_SUCCESS) { + uds_free_buffered_reader(reader); + return vdo_log_error_strerror(result, "failed to read config region"); + } + + uds_free_buffered_reader(reader); + return UDS_SUCCESS; +} + +static int load_index_layout(struct index_layout *layout, struct uds_configuration *config) +{ + int result; + struct buffered_reader *reader; + + result = uds_make_buffered_reader(layout->factory, + layout->offset / UDS_BLOCK_SIZE, 1, &reader); + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, "unable to read superblock"); + + result = load_super_block(layout, UDS_BLOCK_SIZE, + layout->offset / UDS_BLOCK_SIZE, reader); + uds_free_buffered_reader(reader); + if (result != UDS_SUCCESS) + return result; + + result = verify_uds_index_config(layout, config); + if (result != UDS_SUCCESS) + return result; + + return load_sub_index_regions(layout); +} + +static int create_layout_factory(struct index_layout *layout, + const struct uds_configuration *config) +{ + int result; + size_t writable_size; + struct io_factory *factory = NULL; + + result = uds_make_io_factory(config->bdev, &factory); + if (result != UDS_SUCCESS) + return result; + + writable_size = uds_get_writable_size(factory) & -UDS_BLOCK_SIZE; + if (writable_size < config->size + config->offset) { + uds_put_io_factory(factory); + vdo_log_error("index storage (%zu) is smaller than the requested size %zu", + writable_size, config->size + config->offset); + return -ENOSPC; + } + + layout->factory = factory; + layout->factory_size = (config->size > 0) ? config->size : writable_size; + layout->offset = config->offset; + return UDS_SUCCESS; +} + +int uds_make_index_layout(struct uds_configuration *config, bool new_layout, + struct index_layout **layout_ptr) +{ + int result; + struct index_layout *layout = NULL; + struct save_layout_sizes sizes; + + result = compute_sizes(config, &sizes); + if (result != UDS_SUCCESS) + return result; + + result = vdo_allocate(1, struct index_layout, __func__, &layout); + if (result != VDO_SUCCESS) + return result; + + result = create_layout_factory(layout, config); + if (result != UDS_SUCCESS) { + uds_free_index_layout(layout); + return result; + } + + if (layout->factory_size < sizes.total_size) { + vdo_log_error("index storage (%zu) is smaller than the required size %llu", + layout->factory_size, + (unsigned long long) sizes.total_size); + uds_free_index_layout(layout); + return -ENOSPC; + } + + if (new_layout) + result = create_index_layout(layout, config); + else + result = load_index_layout(layout, config); + if (result != UDS_SUCCESS) { + uds_free_index_layout(layout); + return result; + } + + *layout_ptr = layout; + return UDS_SUCCESS; +} + +void uds_free_index_layout(struct index_layout *layout) +{ + if (layout == NULL) + return; + + vdo_free(layout->index.saves); + if (layout->factory != NULL) + uds_put_io_factory(layout->factory); + + vdo_free(layout); +} + +int uds_replace_index_layout_storage(struct index_layout *layout, + struct block_device *bdev) +{ + return uds_replace_storage(layout->factory, bdev); +} + +/* Obtain a dm_bufio_client for the volume region. */ +int uds_open_volume_bufio(struct index_layout *layout, size_t block_size, + unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr) +{ + off_t offset = (layout->index.volume.start_block + + layout->super.volume_offset - + layout->super.start_offset); + + return uds_make_bufio(layout->factory, offset, block_size, reserved_buffers, + client_ptr); +} + +u64 uds_get_volume_nonce(struct index_layout *layout) +{ + return layout->index.nonce; +} diff --git a/drivers/md/dm-vdo/indexer/index-layout.h b/drivers/md/dm-vdo/indexer/index-layout.h new file mode 100644 index 0000000000..e9ac6f4302 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-layout.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_LAYOUT_H +#define UDS_INDEX_LAYOUT_H + +#include "config.h" +#include "indexer.h" +#include "io-factory.h" + +/* + * The index layout describes the format of the index on the underlying storage, and is responsible + * for creating those structures when the index is first created. It also validates the index data + * when loading a saved index, and updates it when saving the index. + */ + +struct index_layout; + +int __must_check uds_make_index_layout(struct uds_configuration *config, bool new_layout, + struct index_layout **layout_ptr); + +void uds_free_index_layout(struct index_layout *layout); + +int __must_check uds_replace_index_layout_storage(struct index_layout *layout, + struct block_device *bdev); + +int __must_check uds_load_index_state(struct index_layout *layout, + struct uds_index *index); + +int __must_check uds_save_index_state(struct index_layout *layout, + struct uds_index *index); + +int __must_check uds_discard_open_chapter(struct index_layout *layout); + +u64 __must_check uds_get_volume_nonce(struct index_layout *layout); + +int __must_check uds_open_volume_bufio(struct index_layout *layout, size_t block_size, + unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr); + +#endif /* UDS_INDEX_LAYOUT_H */ diff --git a/drivers/md/dm-vdo/indexer/index-page-map.c b/drivers/md/dm-vdo/indexer/index-page-map.c new file mode 100644 index 0000000000..00b44e07d0 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-page-map.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "index-page-map.h" + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "string-utils.h" +#include "thread-utils.h" + +#include "hash-utils.h" +#include "indexer.h" + +/* + * The index page map is conceptually a two-dimensional array indexed by chapter number and index + * page number within the chapter. Each entry contains the number of the last delta list on that + * index page. In order to save memory, the information for the last page in each chapter is not + * recorded, as it is known from the geometry. + */ + +static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02"; + +#define PAGE_MAP_MAGIC_LENGTH (sizeof(PAGE_MAP_MAGIC) - 1) + +static inline u32 get_entry_count(const struct index_geometry *geometry) +{ + return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1); +} + +int uds_make_index_page_map(const struct index_geometry *geometry, + struct index_page_map **map_ptr) +{ + int result; + struct index_page_map *map; + + result = vdo_allocate(1, struct index_page_map, "page map", &map); + if (result != VDO_SUCCESS) + return result; + + map->geometry = geometry; + map->entries_per_chapter = geometry->index_pages_per_chapter - 1; + result = vdo_allocate(get_entry_count(geometry), u16, "Index Page Map Entries", + &map->entries); + if (result != VDO_SUCCESS) { + uds_free_index_page_map(map); + return result; + } + + *map_ptr = map; + return UDS_SUCCESS; +} + +void uds_free_index_page_map(struct index_page_map *map) +{ + if (map != NULL) { + vdo_free(map->entries); + vdo_free(map); + } +} + +void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number, + u32 chapter_number, u32 index_page_number, + u32 delta_list_number) +{ + size_t slot; + + map->last_update = virtual_chapter_number; + if (index_page_number == map->entries_per_chapter) + return; + + slot = (chapter_number * map->entries_per_chapter) + index_page_number; + map->entries[slot] = delta_list_number; +} + +u32 uds_find_index_page_number(const struct index_page_map *map, + const struct uds_record_name *name, u32 chapter_number) +{ + u32 delta_list_number = uds_hash_to_chapter_delta_list(name, map->geometry); + u32 slot = chapter_number * map->entries_per_chapter; + u32 page; + + for (page = 0; page < map->entries_per_chapter; page++) { + if (delta_list_number <= map->entries[slot + page]) + break; + } + + return page; +} + +void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number, + u32 index_page_number, u32 *lowest_list, + u32 *highest_list) +{ + u32 slot = chapter_number * map->entries_per_chapter; + + *lowest_list = ((index_page_number == 0) ? + 0 : map->entries[slot + index_page_number - 1] + 1); + *highest_list = ((index_page_number < map->entries_per_chapter) ? + map->entries[slot + index_page_number] : + map->geometry->delta_lists_per_chapter - 1); +} + +u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry) +{ + return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry); +} + +int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer) +{ + int result; + u8 *buffer; + size_t offset = 0; + u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); + u32 i; + + result = vdo_allocate(saved_size, u8, "page map data", &buffer); + if (result != VDO_SUCCESS) + return result; + + memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH); + offset += PAGE_MAP_MAGIC_LENGTH; + encode_u64_le(buffer, &offset, map->last_update); + for (i = 0; i < get_entry_count(map->geometry); i++) + encode_u16_le(buffer, &offset, map->entries[i]); + + result = uds_write_to_buffered_writer(writer, buffer, offset); + vdo_free(buffer); + if (result != UDS_SUCCESS) + return result; + + return uds_flush_buffered_writer(writer); +} + +int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader) +{ + int result; + u8 magic[PAGE_MAP_MAGIC_LENGTH]; + u8 *buffer; + size_t offset = 0; + u64 saved_size = uds_compute_index_page_map_save_size(map->geometry); + u32 i; + + result = vdo_allocate(saved_size, u8, "page map data", &buffer); + if (result != VDO_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, buffer, saved_size); + if (result != UDS_SUCCESS) { + vdo_free(buffer); + return result; + } + + memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH); + offset += PAGE_MAP_MAGIC_LENGTH; + if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) { + vdo_free(buffer); + return UDS_CORRUPT_DATA; + } + + decode_u64_le(buffer, &offset, &map->last_update); + for (i = 0; i < get_entry_count(map->geometry); i++) + decode_u16_le(buffer, &offset, &map->entries[i]); + + vdo_free(buffer); + vdo_log_debug("read index page map, last update %llu", + (unsigned long long) map->last_update); + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/index-page-map.h b/drivers/md/dm-vdo/indexer/index-page-map.h new file mode 100644 index 0000000000..b327c0bb96 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-page-map.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_PAGE_MAP_H +#define UDS_INDEX_PAGE_MAP_H + +#include "geometry.h" +#include "io-factory.h" + +/* + * The index maintains a page map which records how the chapter delta lists are distributed among + * the index pages for each chapter, allowing the volume to be efficient about reading only pages + * that it knows it will need. + */ + +struct index_page_map { + const struct index_geometry *geometry; + u64 last_update; + u32 entries_per_chapter; + u16 *entries; +}; + +int __must_check uds_make_index_page_map(const struct index_geometry *geometry, + struct index_page_map **map_ptr); + +void uds_free_index_page_map(struct index_page_map *map); + +int __must_check uds_read_index_page_map(struct index_page_map *map, + struct buffered_reader *reader); + +int __must_check uds_write_index_page_map(struct index_page_map *map, + struct buffered_writer *writer); + +void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number, + u32 chapter_number, u32 index_page_number, + u32 delta_list_number); + +u32 __must_check uds_find_index_page_number(const struct index_page_map *map, + const struct uds_record_name *name, + u32 chapter_number); + +void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number, + u32 index_page_number, u32 *lowest_list, + u32 *highest_list); + +u64 uds_compute_index_page_map_save_size(const struct index_geometry *geometry); + +#endif /* UDS_INDEX_PAGE_MAP_H */ diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c new file mode 100644 index 0000000000..aee0914d60 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -0,0 +1,739 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "index-session.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "time-utils.h" + +#include "funnel-requestqueue.h" +#include "index.h" +#include "index-layout.h" + +/* + * The index session contains a lock (the request_mutex) which ensures that only one thread can + * change the state of its index at a time. The state field indicates the current state of the + * index through a set of descriptive flags. The request_mutex must be notified whenever a + * non-transient state flag is cleared. The request_mutex is also used to count the number of + * requests currently in progress so that they can be drained when suspending or closing the index. + * + * If the index session is suspended shortly after opening an index, it may have to suspend during + * a rebuild. Depending on the size of the index, a rebuild may take a significant amount of time, + * so UDS allows the rebuild to be paused in order to suspend the session in a timely manner. When + * the index session is resumed, the rebuild can continue from where it left off. If the index + * session is shut down with a suspended rebuild, the rebuild progress is abandoned and the rebuild + * will start from the beginning the next time the index is loaded. The mutex and status fields in + * the index_load_context are used to record the state of any interrupted rebuild. + */ + +enum index_session_flag_bit { + IS_FLAG_BIT_START = 8, + /* The session has started loading an index but not completed it. */ + IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START, + /* The session has loaded an index, which can handle requests. */ + IS_FLAG_BIT_LOADED, + /* The session's index has been permanently disabled. */ + IS_FLAG_BIT_DISABLED, + /* The session's index is suspended. */ + IS_FLAG_BIT_SUSPENDED, + /* The session is handling some index state change. */ + IS_FLAG_BIT_WAITING, + /* The session's index is closing and draining requests. */ + IS_FLAG_BIT_CLOSING, + /* The session is being destroyed and is draining requests. */ + IS_FLAG_BIT_DESTROYING, +}; + +enum index_session_flag { + IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED), + IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING), + IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED), + IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED), + IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING), + IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING), + IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING), +}; + +/* Release a reference to an index session. */ +static void release_index_session(struct uds_index_session *index_session) +{ + mutex_lock(&index_session->request_mutex); + if (--index_session->request_count == 0) + uds_broadcast_cond(&index_session->request_cond); + mutex_unlock(&index_session->request_mutex); +} + +/* + * Acquire a reference to the index session for an asynchronous index request. The reference must + * eventually be released with a corresponding call to release_index_session(). + */ +static int get_index_session(struct uds_index_session *index_session) +{ + unsigned int state; + int result = UDS_SUCCESS; + + mutex_lock(&index_session->request_mutex); + index_session->request_count++; + state = index_session->state; + mutex_unlock(&index_session->request_mutex); + + if (state == IS_FLAG_LOADED) { + return UDS_SUCCESS; + } else if (state & IS_FLAG_DISABLED) { + result = UDS_DISABLED; + } else if ((state & IS_FLAG_LOADING) || + (state & IS_FLAG_SUSPENDED) || + (state & IS_FLAG_WAITING)) { + result = -EBUSY; + } else { + result = UDS_NO_INDEX; + } + + release_index_session(index_session); + return result; +} + +int uds_launch_request(struct uds_request *request) +{ + size_t internal_size; + int result; + + if (request->callback == NULL) { + vdo_log_error("missing required callback"); + return -EINVAL; + } + + switch (request->type) { + case UDS_DELETE: + case UDS_POST: + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + case UDS_UPDATE: + break; + default: + vdo_log_error("received invalid callback type"); + return -EINVAL; + } + + /* Reset all internal fields before processing. */ + internal_size = + sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); + // FIXME should be using struct_group for this instead + memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); + + result = get_index_session(request->session); + if (result != UDS_SUCCESS) + return result; + + request->found = false; + request->unbatched = false; + request->index = request->session->index; + + uds_enqueue_request(request, STAGE_TRIAGE); + return UDS_SUCCESS; +} + +static void enter_callback_stage(struct uds_request *request) +{ + if (request->status != UDS_SUCCESS) { + /* All request errors are considered unrecoverable */ + mutex_lock(&request->session->request_mutex); + request->session->state |= IS_FLAG_DISABLED; + mutex_unlock(&request->session->request_mutex); + } + + uds_request_queue_enqueue(request->session->callback_queue, request); +} + +static inline void count_once(u64 *count_ptr) +{ + WRITE_ONCE(*count_ptr, READ_ONCE(*count_ptr) + 1); +} + +static void update_session_stats(struct uds_request *request) +{ + struct session_stats *session_stats = &request->session->stats; + + count_once(&session_stats->requests); + + switch (request->type) { + case UDS_POST: + if (request->found) + count_once(&session_stats->posts_found); + else + count_once(&session_stats->posts_not_found); + + if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) + count_once(&session_stats->posts_found_open_chapter); + else if (request->location == UDS_LOCATION_IN_DENSE) + count_once(&session_stats->posts_found_dense); + else if (request->location == UDS_LOCATION_IN_SPARSE) + count_once(&session_stats->posts_found_sparse); + break; + + case UDS_UPDATE: + if (request->found) + count_once(&session_stats->updates_found); + else + count_once(&session_stats->updates_not_found); + break; + + case UDS_DELETE: + if (request->found) + count_once(&session_stats->deletions_found); + else + count_once(&session_stats->deletions_not_found); + break; + + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + if (request->found) + count_once(&session_stats->queries_found); + else + count_once(&session_stats->queries_not_found); + break; + + default: + request->status = VDO_ASSERT(false, "unknown request type: %d", + request->type); + } +} + +static void handle_callbacks(struct uds_request *request) +{ + struct uds_index_session *index_session = request->session; + + if (request->status == UDS_SUCCESS) + update_session_stats(request); + + request->status = uds_status_to_errno(request->status); + request->callback(request); + release_index_session(index_session); +} + +static int __must_check make_empty_index_session(struct uds_index_session **index_session_ptr) +{ + int result; + struct uds_index_session *session; + + result = vdo_allocate(1, struct uds_index_session, __func__, &session); + if (result != VDO_SUCCESS) + return result; + + mutex_init(&session->request_mutex); + uds_init_cond(&session->request_cond); + mutex_init(&session->load_context.mutex); + uds_init_cond(&session->load_context.cond); + + result = uds_make_request_queue("callbackW", &handle_callbacks, + &session->callback_queue); + if (result != UDS_SUCCESS) { + vdo_free(session); + return result; + } + + *index_session_ptr = session; + return UDS_SUCCESS; +} + +int uds_create_index_session(struct uds_index_session **session) +{ + if (session == NULL) { + vdo_log_error("missing session pointer"); + return -EINVAL; + } + + return uds_status_to_errno(make_empty_index_session(session)); +} + +static int __must_check start_loading_index_session(struct uds_index_session *index_session) +{ + int result; + + mutex_lock(&index_session->request_mutex); + if (index_session->state & IS_FLAG_SUSPENDED) { + vdo_log_info("Index session is suspended"); + result = -EBUSY; + } else if (index_session->state != 0) { + vdo_log_info("Index is already loaded"); + result = -EBUSY; + } else { + index_session->state |= IS_FLAG_LOADING; + result = UDS_SUCCESS; + } + mutex_unlock(&index_session->request_mutex); + return result; +} + +static void finish_loading_index_session(struct uds_index_session *index_session, + int result) +{ + mutex_lock(&index_session->request_mutex); + index_session->state &= ~IS_FLAG_LOADING; + if (result == UDS_SUCCESS) + index_session->state |= IS_FLAG_LOADED; + + uds_broadcast_cond(&index_session->request_cond); + mutex_unlock(&index_session->request_mutex); +} + +static int initialize_index_session(struct uds_index_session *index_session, + enum uds_open_index_type open_type) +{ + int result; + struct uds_configuration *config; + + result = uds_make_configuration(&index_session->parameters, &config); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, "Failed to allocate config"); + return result; + } + + memset(&index_session->stats, 0, sizeof(index_session->stats)); + result = uds_make_index(config, open_type, &index_session->load_context, + enter_callback_stage, &index_session->index); + if (result != UDS_SUCCESS) + vdo_log_error_strerror(result, "Failed to make index"); + else + uds_log_configuration(config); + + uds_free_configuration(config); + return result; +} + +static const char *get_open_type_string(enum uds_open_index_type open_type) +{ + switch (open_type) { + case UDS_CREATE: + return "creating index"; + case UDS_LOAD: + return "loading or rebuilding index"; + case UDS_NO_REBUILD: + return "loading index"; + default: + return "unknown open method"; + } +} + +/* + * Open an index under the given session. This operation will fail if the + * index session is suspended, or if there is already an open index. + */ +int uds_open_index(enum uds_open_index_type open_type, + const struct uds_parameters *parameters, + struct uds_index_session *session) +{ + int result; + char name[BDEVNAME_SIZE]; + + if (parameters == NULL) { + vdo_log_error("missing required parameters"); + return -EINVAL; + } + if (parameters->bdev == NULL) { + vdo_log_error("missing required block device"); + return -EINVAL; + } + if (session == NULL) { + vdo_log_error("missing required session pointer"); + return -EINVAL; + } + + result = start_loading_index_session(session); + if (result != UDS_SUCCESS) + return uds_status_to_errno(result); + + session->parameters = *parameters; + format_dev_t(name, parameters->bdev->bd_dev); + vdo_log_info("%s: %s", get_open_type_string(open_type), name); + + result = initialize_index_session(session, open_type); + if (result != UDS_SUCCESS) + vdo_log_error_strerror(result, "Failed %s", + get_open_type_string(open_type)); + + finish_loading_index_session(session, result); + return uds_status_to_errno(result); +} + +static void wait_for_no_requests_in_progress(struct uds_index_session *index_session) +{ + mutex_lock(&index_session->request_mutex); + while (index_session->request_count > 0) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + mutex_unlock(&index_session->request_mutex); +} + +static int __must_check save_index(struct uds_index_session *index_session) +{ + wait_for_no_requests_in_progress(index_session); + return uds_save_index(index_session->index); +} + +static void suspend_rebuild(struct uds_index_session *session) +{ + mutex_lock(&session->load_context.mutex); + switch (session->load_context.status) { + case INDEX_OPENING: + session->load_context.status = INDEX_SUSPENDING; + + /* Wait until the index indicates that it is not replaying. */ + while ((session->load_context.status != INDEX_SUSPENDED) && + (session->load_context.status != INDEX_READY)) { + uds_wait_cond(&session->load_context.cond, + &session->load_context.mutex); + } + + break; + + case INDEX_READY: + /* Index load does not need to be suspended. */ + break; + + case INDEX_SUSPENDED: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + /* These cases should not happen. */ + VDO_ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->load_context.status); + break; + } + mutex_unlock(&session->load_context.mutex); +} + +/* + * Suspend index operation, draining all current index requests and preventing new index requests + * from starting. Optionally saves all index data before returning. + */ +int uds_suspend_index_session(struct uds_index_session *session, bool save) +{ + int result = UDS_SUCCESS; + bool no_work = false; + bool rebuilding = false; + + /* Wait for any current index state change to complete. */ + mutex_lock(&session->request_mutex); + while (session->state & IS_FLAG_CLOSING) + uds_wait_cond(&session->request_cond, &session->request_mutex); + + if ((session->state & IS_FLAG_WAITING) || (session->state & IS_FLAG_DESTROYING)) { + no_work = true; + vdo_log_info("Index session is already changing state"); + result = -EBUSY; + } else if (session->state & IS_FLAG_SUSPENDED) { + no_work = true; + } else if (session->state & IS_FLAG_LOADING) { + session->state |= IS_FLAG_WAITING; + rebuilding = true; + } else if (session->state & IS_FLAG_LOADED) { + session->state |= IS_FLAG_WAITING; + } else { + no_work = true; + session->state |= IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + } + mutex_unlock(&session->request_mutex); + + if (no_work) + return uds_status_to_errno(result); + + if (rebuilding) + suspend_rebuild(session); + else if (save) + result = save_index(session); + else + result = uds_flush_index_session(session); + + mutex_lock(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + mutex_unlock(&session->request_mutex); + return uds_status_to_errno(result); +} + +static int replace_device(struct uds_index_session *session, struct block_device *bdev) +{ + int result; + + result = uds_replace_index_storage(session->index, bdev); + if (result != UDS_SUCCESS) + return result; + + session->parameters.bdev = bdev; + return UDS_SUCCESS; +} + +/* + * Resume index operation after being suspended. If the index is suspended and the supplied block + * device differs from the current backing store, the index will start using the new backing store. + */ +int uds_resume_index_session(struct uds_index_session *session, + struct block_device *bdev) +{ + int result = UDS_SUCCESS; + bool no_work = false; + bool resume_replay = false; + + mutex_lock(&session->request_mutex); + if (session->state & IS_FLAG_WAITING) { + vdo_log_info("Index session is already changing state"); + no_work = true; + result = -EBUSY; + } else if (!(session->state & IS_FLAG_SUSPENDED)) { + /* If not suspended, just succeed. */ + no_work = true; + result = UDS_SUCCESS; + } else { + session->state |= IS_FLAG_WAITING; + if (session->state & IS_FLAG_LOADING) + resume_replay = true; + } + mutex_unlock(&session->request_mutex); + + if (no_work) + return result; + + if ((session->index != NULL) && (bdev != session->parameters.bdev)) { + result = replace_device(session, bdev); + if (result != UDS_SUCCESS) { + mutex_lock(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + uds_broadcast_cond(&session->request_cond); + mutex_unlock(&session->request_mutex); + return uds_status_to_errno(result); + } + } + + if (resume_replay) { + mutex_lock(&session->load_context.mutex); + switch (session->load_context.status) { + case INDEX_SUSPENDED: + session->load_context.status = INDEX_OPENING; + /* Notify the index to start replaying again. */ + uds_broadcast_cond(&session->load_context.cond); + break; + + case INDEX_READY: + /* There is no index rebuild to resume. */ + break; + + case INDEX_OPENING: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + /* These cases should not happen; do nothing. */ + VDO_ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->load_context.status); + break; + } + mutex_unlock(&session->load_context.mutex); + } + + mutex_lock(&session->request_mutex); + session->state &= ~IS_FLAG_WAITING; + session->state &= ~IS_FLAG_SUSPENDED; + uds_broadcast_cond(&session->request_cond); + mutex_unlock(&session->request_mutex); + return UDS_SUCCESS; +} + +static int save_and_free_index(struct uds_index_session *index_session) +{ + int result = UDS_SUCCESS; + bool suspended; + struct uds_index *index = index_session->index; + + if (index == NULL) + return UDS_SUCCESS; + + mutex_lock(&index_session->request_mutex); + suspended = (index_session->state & IS_FLAG_SUSPENDED); + mutex_unlock(&index_session->request_mutex); + + if (!suspended) { + result = uds_save_index(index); + if (result != UDS_SUCCESS) + vdo_log_warning_strerror(result, + "ignoring error from save_index"); + } + uds_free_index(index); + index_session->index = NULL; + + /* + * Reset all index state that happens to be in the index + * session, so it doesn't affect any future index. + */ + mutex_lock(&index_session->load_context.mutex); + index_session->load_context.status = INDEX_OPENING; + mutex_unlock(&index_session->load_context.mutex); + + mutex_lock(&index_session->request_mutex); + /* Only the suspend bit will remain relevant. */ + index_session->state &= IS_FLAG_SUSPENDED; + mutex_unlock(&index_session->request_mutex); + + return result; +} + +/* Save and close the current index. */ +int uds_close_index(struct uds_index_session *index_session) +{ + int result = UDS_SUCCESS; + + /* Wait for any current index state change to complete. */ + mutex_lock(&index_session->request_mutex); + while ((index_session->state & IS_FLAG_WAITING) || + (index_session->state & IS_FLAG_CLOSING)) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + + if (index_session->state & IS_FLAG_SUSPENDED) { + vdo_log_info("Index session is suspended"); + result = -EBUSY; + } else if ((index_session->state & IS_FLAG_DESTROYING) || + !(index_session->state & IS_FLAG_LOADED)) { + /* The index doesn't exist, hasn't finished loading, or is being destroyed. */ + result = UDS_NO_INDEX; + } else { + index_session->state |= IS_FLAG_CLOSING; + } + mutex_unlock(&index_session->request_mutex); + if (result != UDS_SUCCESS) + return uds_status_to_errno(result); + + vdo_log_debug("Closing index"); + wait_for_no_requests_in_progress(index_session); + result = save_and_free_index(index_session); + vdo_log_debug("Closed index"); + + mutex_lock(&index_session->request_mutex); + index_session->state &= ~IS_FLAG_CLOSING; + uds_broadcast_cond(&index_session->request_cond); + mutex_unlock(&index_session->request_mutex); + return uds_status_to_errno(result); +} + +/* This will save and close an open index before destroying the session. */ +int uds_destroy_index_session(struct uds_index_session *index_session) +{ + int result; + bool load_pending = false; + + vdo_log_debug("Destroying index session"); + + /* Wait for any current index state change to complete. */ + mutex_lock(&index_session->request_mutex); + while ((index_session->state & IS_FLAG_WAITING) || + (index_session->state & IS_FLAG_CLOSING)) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + + if (index_session->state & IS_FLAG_DESTROYING) { + mutex_unlock(&index_session->request_mutex); + vdo_log_info("Index session is already closing"); + return -EBUSY; + } + + index_session->state |= IS_FLAG_DESTROYING; + load_pending = ((index_session->state & IS_FLAG_LOADING) && + (index_session->state & IS_FLAG_SUSPENDED)); + mutex_unlock(&index_session->request_mutex); + + if (load_pending) { + /* Tell the index to terminate the rebuild. */ + mutex_lock(&index_session->load_context.mutex); + if (index_session->load_context.status == INDEX_SUSPENDED) { + index_session->load_context.status = INDEX_FREEING; + uds_broadcast_cond(&index_session->load_context.cond); + } + mutex_unlock(&index_session->load_context.mutex); + + /* Wait until the load exits before proceeding. */ + mutex_lock(&index_session->request_mutex); + while (index_session->state & IS_FLAG_LOADING) { + uds_wait_cond(&index_session->request_cond, + &index_session->request_mutex); + } + mutex_unlock(&index_session->request_mutex); + } + + wait_for_no_requests_in_progress(index_session); + result = save_and_free_index(index_session); + uds_request_queue_finish(index_session->callback_queue); + index_session->callback_queue = NULL; + vdo_log_debug("Destroyed index session"); + vdo_free(index_session); + return uds_status_to_errno(result); +} + +/* Wait until all callbacks for index operations are complete. */ +int uds_flush_index_session(struct uds_index_session *index_session) +{ + wait_for_no_requests_in_progress(index_session); + uds_wait_for_idle_index(index_session->index); + return UDS_SUCCESS; +} + +/* Statistics collection is intended to be thread-safe. */ +static void collect_stats(const struct uds_index_session *index_session, + struct uds_index_stats *stats) +{ + const struct session_stats *session_stats = &index_session->stats; + + stats->current_time = ktime_to_seconds(current_time_ns(CLOCK_REALTIME)); + stats->posts_found = READ_ONCE(session_stats->posts_found); + stats->in_memory_posts_found = READ_ONCE(session_stats->posts_found_open_chapter); + stats->dense_posts_found = READ_ONCE(session_stats->posts_found_dense); + stats->sparse_posts_found = READ_ONCE(session_stats->posts_found_sparse); + stats->posts_not_found = READ_ONCE(session_stats->posts_not_found); + stats->updates_found = READ_ONCE(session_stats->updates_found); + stats->updates_not_found = READ_ONCE(session_stats->updates_not_found); + stats->deletions_found = READ_ONCE(session_stats->deletions_found); + stats->deletions_not_found = READ_ONCE(session_stats->deletions_not_found); + stats->queries_found = READ_ONCE(session_stats->queries_found); + stats->queries_not_found = READ_ONCE(session_stats->queries_not_found); + stats->requests = READ_ONCE(session_stats->requests); +} + +int uds_get_index_session_stats(struct uds_index_session *index_session, + struct uds_index_stats *stats) +{ + if (stats == NULL) { + vdo_log_error("received a NULL index stats pointer"); + return -EINVAL; + } + + collect_stats(index_session, stats); + if (index_session->index != NULL) { + uds_get_index_stats(index_session->index, stats); + } else { + stats->entries_indexed = 0; + stats->memory_used = 0; + stats->collisions = 0; + stats->entries_discarded = 0; + } + + return UDS_SUCCESS; +} + +void uds_wait_cond(struct cond_var *cv, struct mutex *mutex) +{ + DEFINE_WAIT(__wait); + + prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE); + mutex_unlock(mutex); + schedule(); + finish_wait(&cv->wait_queue, &__wait); + mutex_lock(mutex); +} diff --git a/drivers/md/dm-vdo/indexer/index-session.h b/drivers/md/dm-vdo/indexer/index-session.h new file mode 100644 index 0000000000..066648f6e0 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index-session.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_SESSION_H +#define UDS_INDEX_SESSION_H + +#include +#include + +#include "thread-utils.h" + +#include "config.h" +#include "indexer.h" + +/* + * The index session mediates all interactions with a UDS index. Once the index session is created, + * it can be used to open, close, suspend, or recreate an index. It implements the majority of the + * functions in the top-level UDS API. + * + * If any deduplication request fails due to an internal error, the index is marked disabled. It + * will not accept any further requests and can only be closed. Closing the index will clear the + * disabled flag, and the index can then be reopened and recovered using the same index session. + */ + +struct __aligned(L1_CACHE_BYTES) session_stats { + /* Post requests that found an entry */ + u64 posts_found; + /* Post requests found in the open chapter */ + u64 posts_found_open_chapter; + /* Post requests found in the dense index */ + u64 posts_found_dense; + /* Post requests found in the sparse index */ + u64 posts_found_sparse; + /* Post requests that did not find an entry */ + u64 posts_not_found; + /* Update requests that found an entry */ + u64 updates_found; + /* Update requests that did not find an entry */ + u64 updates_not_found; + /* Delete requests that found an entry */ + u64 deletions_found; + /* Delete requests that did not find an entry */ + u64 deletions_not_found; + /* Query requests that found an entry */ + u64 queries_found; + /* Query requests that did not find an entry */ + u64 queries_not_found; + /* Total number of requests */ + u64 requests; +}; + +enum index_suspend_status { + /* An index load has started but the index is not ready for use. */ + INDEX_OPENING = 0, + /* The index is able to handle requests. */ + INDEX_READY, + /* The index is attempting to suspend a rebuild. */ + INDEX_SUSPENDING, + /* An index rebuild has been suspended. */ + INDEX_SUSPENDED, + /* An index rebuild is being stopped in order to shut down. */ + INDEX_FREEING, +}; + +struct index_load_context { + struct mutex mutex; + struct cond_var cond; + enum index_suspend_status status; +}; + +struct uds_index_session { + unsigned int state; + struct uds_index *index; + struct uds_request_queue *callback_queue; + struct uds_parameters parameters; + struct index_load_context load_context; + struct mutex request_mutex; + struct cond_var request_cond; + int request_count; + struct session_stats stats; +}; + +#endif /* UDS_INDEX_SESSION_H */ diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c new file mode 100644 index 0000000000..1ba7671444 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index.c @@ -0,0 +1,1388 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + + +#include "index.h" + +#include "logger.h" +#include "memory-alloc.h" + +#include "funnel-requestqueue.h" +#include "hash-utils.h" +#include "sparse-cache.h" + +static const u64 NO_LAST_SAVE = U64_MAX; + +/* + * When searching for deduplication records, the index first searches the volume index, and then + * searches the chapter index for the relevant chapter. If the chapter has been fully committed to + * storage, the chapter pages are loaded into the page cache. If the chapter has not yet been + * committed (either the open chapter or a recently closed one), the index searches the in-memory + * representation of the chapter. Finally, if the volume index does not find a record and the index + * is sparse, the index will search the sparse cache. + * + * The index send two kinds of messages to coordinate between zones: chapter close messages for the + * chapter writer, and sparse cache barrier messages for the sparse cache. + * + * The chapter writer is responsible for committing chapters of records to storage. Since zones can + * get different numbers of records, some zones may fall behind others. Each time a zone fills up + * its available space in a chapter, it informs the chapter writer that the chapter is complete, + * and also informs all other zones that it has closed the chapter. Each other zone will then close + * the chapter immediately, regardless of how full it is, in order to minimize skew between zones. + * Once every zone has closed the chapter, the chapter writer will commit that chapter to storage. + * + * The last zone to close the chapter also removes the oldest chapter from the volume index. + * Although that chapter is invalid for zones that have moved on, the existence of the open chapter + * means that those zones will never ask the volume index about it. No zone is allowed to get more + * than one chapter ahead of any other. If a zone is so far ahead that it tries to close another + * chapter before the previous one has been closed by all zones, it is forced to wait. + * + * The sparse cache relies on having the same set of chapter indexes available to all zones. When a + * request wants to add a chapter to the sparse cache, it sends a barrier message to each zone + * during the triage stage that acts as a rendezvous. Once every zone has reached the barrier and + * paused its operations, the cache membership is changed and each zone is then informed that it + * can proceed. More details can be found in the sparse cache documentation. + * + * If a sparse cache has only one zone, it will not create a triage queue, but it still needs the + * barrier message to change the sparse cache membership, so the index simulates the message by + * invoking the handler directly. + */ + +struct chapter_writer { + /* The index to which we belong */ + struct uds_index *index; + /* The thread to do the writing */ + struct thread *thread; + /* The lock protecting the following fields */ + struct mutex mutex; + /* The condition signalled on state changes */ + struct cond_var cond; + /* Set to true to stop the thread */ + bool stop; + /* The result from the most recent write */ + int result; + /* The number of bytes allocated by the chapter writer */ + size_t memory_size; + /* The number of zones which have submitted a chapter for writing */ + unsigned int zones_to_write; + /* Open chapter index used by uds_close_open_chapter() */ + struct open_chapter_index *open_chapter_index; + /* Collated records used by uds_close_open_chapter() */ + struct uds_volume_record *collated_records; + /* The chapters to write (one per zone) */ + struct open_chapter_zone *chapters[]; +}; + +static bool is_zone_chapter_sparse(const struct index_zone *zone, u64 virtual_chapter) +{ + return uds_is_chapter_sparse(zone->index->volume->geometry, + zone->oldest_virtual_chapter, + zone->newest_virtual_chapter, virtual_chapter); +} + +static int launch_zone_message(struct uds_zone_message message, unsigned int zone, + struct uds_index *index) +{ + int result; + struct uds_request *request; + + result = vdo_allocate(1, struct uds_request, __func__, &request); + if (result != VDO_SUCCESS) + return result; + + request->index = index; + request->unbatched = true; + request->zone_number = zone; + request->zone_message = message; + + uds_enqueue_request(request, STAGE_MESSAGE); + return UDS_SUCCESS; +} + +static void enqueue_barrier_messages(struct uds_index *index, u64 virtual_chapter) +{ + struct uds_zone_message message = { + .type = UDS_MESSAGE_SPARSE_CACHE_BARRIER, + .virtual_chapter = virtual_chapter, + }; + unsigned int zone; + + for (zone = 0; zone < index->zone_count; zone++) { + int result = launch_zone_message(message, zone, index); + + VDO_ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation"); + } +} + +/* + * Determine whether this request should trigger a sparse cache barrier message to change the + * membership of the sparse cache. If a change in membership is desired, the function returns the + * chapter number to add. + */ +static u64 triage_index_request(struct uds_index *index, struct uds_request *request) +{ + u64 virtual_chapter; + struct index_zone *zone; + + virtual_chapter = uds_lookup_volume_index_name(index->volume_index, + &request->record_name); + if (virtual_chapter == NO_CHAPTER) + return NO_CHAPTER; + + zone = index->zones[request->zone_number]; + if (!is_zone_chapter_sparse(zone, virtual_chapter)) + return NO_CHAPTER; + + /* + * FIXME: Optimize for a common case by remembering the chapter from the most recent + * barrier message and skipping this chapter if is it the same. + */ + + return virtual_chapter; +} + +/* + * Simulate a message to change the sparse cache membership for a single-zone sparse index. This + * allows us to forgo the complicated locking required by a multi-zone sparse index. Any other kind + * of index does nothing here. + */ +static int simulate_index_zone_barrier_message(struct index_zone *zone, + struct uds_request *request) +{ + u64 sparse_virtual_chapter; + + if ((zone->index->zone_count > 1) || + !uds_is_sparse_index_geometry(zone->index->volume->geometry)) + return UDS_SUCCESS; + + sparse_virtual_chapter = triage_index_request(zone->index, request); + if (sparse_virtual_chapter == NO_CHAPTER) + return UDS_SUCCESS; + + return uds_update_sparse_cache(zone, sparse_virtual_chapter); +} + +/* This is the request processing function for the triage queue. */ +static void triage_request(struct uds_request *request) +{ + struct uds_index *index = request->index; + u64 sparse_virtual_chapter = triage_index_request(index, request); + + if (sparse_virtual_chapter != NO_CHAPTER) + enqueue_barrier_messages(index, sparse_virtual_chapter); + + uds_enqueue_request(request, STAGE_INDEX); +} + +static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_number) +{ + int result; + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + while (index->newest_virtual_chapter < current_chapter_number) + uds_wait_cond(&writer->cond, &writer->mutex); + result = writer->result; + mutex_unlock(&writer->mutex); + + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, + "Writing of previous open chapter failed"); + + return UDS_SUCCESS; +} + +static int swap_open_chapter(struct index_zone *zone) +{ + int result; + struct open_chapter_zone *temporary_chapter; + + result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter); + if (result != UDS_SUCCESS) + return result; + + temporary_chapter = zone->open_chapter; + zone->open_chapter = zone->writing_chapter; + zone->writing_chapter = temporary_chapter; + return UDS_SUCCESS; +} + +/* + * Inform the chapter writer that this zone is done with this chapter. The chapter won't start + * writing until all zones have closed it. + */ +static unsigned int start_closing_chapter(struct uds_index *index, + unsigned int zone_number, + struct open_chapter_zone *chapter) +{ + unsigned int finished_zones; + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + finished_zones = ++writer->zones_to_write; + writer->chapters[zone_number] = chapter; + uds_broadcast_cond(&writer->cond); + mutex_unlock(&writer->mutex); + + return finished_zones; +} + +static int announce_chapter_closed(struct index_zone *zone, u64 closed_chapter) +{ + int result; + unsigned int i; + struct uds_zone_message zone_message = { + .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED, + .virtual_chapter = closed_chapter, + }; + + for (i = 0; i < zone->index->zone_count; i++) { + if (zone->id == i) + continue; + + result = launch_zone_message(zone_message, i, zone->index); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +static int open_next_chapter(struct index_zone *zone) +{ + int result; + u64 closed_chapter; + u64 expiring; + unsigned int finished_zones; + u32 expire_chapters; + + vdo_log_debug("closing chapter %llu of zone %u after %u entries (%u short)", + (unsigned long long) zone->newest_virtual_chapter, zone->id, + zone->open_chapter->size, + zone->open_chapter->capacity - zone->open_chapter->size); + + result = swap_open_chapter(zone); + if (result != UDS_SUCCESS) + return result; + + closed_chapter = zone->newest_virtual_chapter++; + uds_set_volume_index_zone_open_chapter(zone->index->volume_index, zone->id, + zone->newest_virtual_chapter); + uds_reset_open_chapter(zone->open_chapter); + + finished_zones = start_closing_chapter(zone->index, zone->id, + zone->writing_chapter); + if ((finished_zones == 1) && (zone->index->zone_count > 1)) { + result = announce_chapter_closed(zone, closed_chapter); + if (result != UDS_SUCCESS) + return result; + } + + expiring = zone->oldest_virtual_chapter; + expire_chapters = uds_chapters_to_expire(zone->index->volume->geometry, + zone->newest_virtual_chapter); + zone->oldest_virtual_chapter += expire_chapters; + + if (finished_zones < zone->index->zone_count) + return UDS_SUCCESS; + + while (expire_chapters-- > 0) + uds_forget_chapter(zone->index->volume, expiring++); + + return UDS_SUCCESS; +} + +static int handle_chapter_closed(struct index_zone *zone, u64 virtual_chapter) +{ + if (zone->newest_virtual_chapter == virtual_chapter) + return open_next_chapter(zone); + + return UDS_SUCCESS; +} + +static int dispatch_index_zone_control_request(struct uds_request *request) +{ + struct uds_zone_message *message = &request->zone_message; + struct index_zone *zone = request->index->zones[request->zone_number]; + + switch (message->type) { + case UDS_MESSAGE_SPARSE_CACHE_BARRIER: + return uds_update_sparse_cache(zone, message->virtual_chapter); + + case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED: + return handle_chapter_closed(zone, message->virtual_chapter); + + default: + vdo_log_error("invalid message type: %d", message->type); + return UDS_INVALID_ARGUMENT; + } +} + +static void set_request_location(struct uds_request *request, + enum uds_index_region new_location) +{ + request->location = new_location; + request->found = ((new_location == UDS_LOCATION_IN_OPEN_CHAPTER) || + (new_location == UDS_LOCATION_IN_DENSE) || + (new_location == UDS_LOCATION_IN_SPARSE)); +} + +static void set_chapter_location(struct uds_request *request, + const struct index_zone *zone, u64 virtual_chapter) +{ + request->found = true; + if (virtual_chapter == zone->newest_virtual_chapter) + request->location = UDS_LOCATION_IN_OPEN_CHAPTER; + else if (is_zone_chapter_sparse(zone, virtual_chapter)) + request->location = UDS_LOCATION_IN_SPARSE; + else + request->location = UDS_LOCATION_IN_DENSE; +} + +static int search_sparse_cache_in_zone(struct index_zone *zone, struct uds_request *request, + u64 virtual_chapter, bool *found) +{ + int result; + struct volume *volume; + u16 record_page_number; + u32 chapter; + + result = uds_search_sparse_cache(zone, &request->record_name, &virtual_chapter, + &record_page_number); + if ((result != UDS_SUCCESS) || (virtual_chapter == NO_CHAPTER)) + return result; + + request->virtual_chapter = virtual_chapter; + volume = zone->index->volume; + chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter); + return uds_search_cached_record_page(volume, request, chapter, + record_page_number, found); +} + +static int get_record_from_zone(struct index_zone *zone, struct uds_request *request, + bool *found) +{ + struct volume *volume; + + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + *found = true; + return UDS_SUCCESS; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + *found = false; + return UDS_SUCCESS; + } + + if (request->virtual_chapter == zone->newest_virtual_chapter) { + uds_search_open_chapter(zone->open_chapter, &request->record_name, + &request->old_metadata, found); + return UDS_SUCCESS; + } + + if ((zone->newest_virtual_chapter > 0) && + (request->virtual_chapter == (zone->newest_virtual_chapter - 1)) && + (zone->writing_chapter->size > 0)) { + uds_search_open_chapter(zone->writing_chapter, &request->record_name, + &request->old_metadata, found); + return UDS_SUCCESS; + } + + volume = zone->index->volume; + if (is_zone_chapter_sparse(zone, request->virtual_chapter) && + uds_sparse_cache_contains(volume->sparse_cache, request->virtual_chapter, + request->zone_number)) + return search_sparse_cache_in_zone(zone, request, + request->virtual_chapter, found); + + return uds_search_volume_page_cache(volume, request, found); +} + +static int put_record_in_zone(struct index_zone *zone, struct uds_request *request, + const struct uds_record_data *metadata) +{ + unsigned int remaining; + + remaining = uds_put_open_chapter(zone->open_chapter, &request->record_name, + metadata); + if (remaining == 0) + return open_next_chapter(zone); + + return UDS_SUCCESS; +} + +static int search_index_zone(struct index_zone *zone, struct uds_request *request) +{ + int result; + struct volume_index_record record; + bool overflow_record, found = false; + struct uds_record_data *metadata; + u64 chapter; + + result = uds_get_volume_index_record(zone->index->volume_index, + &request->record_name, &record); + if (result != UDS_SUCCESS) + return result; + + if (record.is_found) { + if (request->requeued && request->virtual_chapter != record.virtual_chapter) + set_request_location(request, UDS_LOCATION_UNKNOWN); + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) + return result; + } + + if (found) + set_chapter_location(request, zone, record.virtual_chapter); + + /* + * If a record has overflowed a chapter index in more than one chapter (or overflowed in + * one chapter and collided with an existing record), it will exist as a collision record + * in the volume index, but we won't find it in the volume. This case needs special + * handling. + */ + overflow_record = (record.is_found && record.is_collision && !found); + chapter = zone->newest_virtual_chapter; + if (found || overflow_record) { + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && overflow_record)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + if (record.virtual_chapter != chapter) { + /* + * Update the volume index to reference the new chapter for the block. If + * the record had been deleted or dropped from the chapter index, it will + * be back. + */ + result = uds_set_volume_index_record_chapter(&record, chapter); + } else if (request->type != UDS_UPDATE) { + /* The record is already in the open chapter. */ + return UDS_SUCCESS; + } + } else { + /* + * The record wasn't in the volume index, so check whether the + * name is in a cached sparse chapter. If we found the name on + * a previous search, use that result instead. + */ + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + found = true; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + found = false; + } else if (uds_is_sparse_index_geometry(zone->index->volume->geometry) && + !uds_is_volume_index_sample(zone->index->volume_index, + &request->record_name)) { + result = search_sparse_cache_in_zone(zone, request, NO_CHAPTER, + &found); + if (result != UDS_SUCCESS) + return result; + } + + if (found) + set_request_location(request, UDS_LOCATION_IN_SPARSE); + + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && !found)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + /* + * Add a new entry to the volume index referencing the open chapter. This needs to + * be done both for new records, and for records from cached sparse chapters. + */ + result = uds_put_volume_index_record(&record, chapter); + } + + if (result == UDS_OVERFLOW) { + /* + * The volume index encountered a delta list overflow. The condition was already + * logged. We will go on without adding the record to the open chapter. + */ + return UDS_SUCCESS; + } + + if (result != UDS_SUCCESS) + return result; + + if (!found || (request->type == UDS_UPDATE)) { + /* This is a new record or we're updating an existing record. */ + metadata = &request->new_metadata; + } else { + /* Move the existing record to the open chapter. */ + metadata = &request->old_metadata; + } + + return put_record_in_zone(zone, request, metadata); +} + +static int remove_from_index_zone(struct index_zone *zone, struct uds_request *request) +{ + int result; + struct volume_index_record record; + + result = uds_get_volume_index_record(zone->index->volume_index, + &request->record_name, &record); + if (result != UDS_SUCCESS) + return result; + + if (!record.is_found) + return UDS_SUCCESS; + + /* If the request was requeued, check whether the saved state is still valid. */ + + if (record.is_collision) { + set_chapter_location(request, zone, record.virtual_chapter); + } else { + /* Non-collision records are hints, so resolve the name in the chapter. */ + bool found; + + if (request->requeued && request->virtual_chapter != record.virtual_chapter) + set_request_location(request, UDS_LOCATION_UNKNOWN); + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) + return result; + + if (!found) { + /* There is no record to remove. */ + return UDS_SUCCESS; + } + } + + set_chapter_location(request, zone, record.virtual_chapter); + + /* + * Delete the volume index entry for the named record only. Note that a later search might + * later return stale advice if there is a colliding name in the same chapter, but it's a + * very rare case (1 in 2^21). + */ + result = uds_remove_volume_index_record(&record); + if (result != UDS_SUCCESS) + return result; + + /* + * If the record is in the open chapter, we must remove it or mark it deleted to avoid + * trouble if the record is added again later. + */ + if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) + uds_remove_from_open_chapter(zone->open_chapter, &request->record_name); + + return UDS_SUCCESS; +} + +static int dispatch_index_request(struct uds_index *index, struct uds_request *request) +{ + int result; + struct index_zone *zone = index->zones[request->zone_number]; + + if (!request->requeued) { + result = simulate_index_zone_barrier_message(zone, request); + if (result != UDS_SUCCESS) + return result; + } + + switch (request->type) { + case UDS_POST: + case UDS_UPDATE: + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + result = search_index_zone(zone, request); + break; + + case UDS_DELETE: + result = remove_from_index_zone(zone, request); + break; + + default: + result = vdo_log_warning_strerror(UDS_INVALID_ARGUMENT, + "invalid request type: %d", + request->type); + break; + } + + return result; +} + +/* This is the request processing function invoked by each zone's thread. */ +static void execute_zone_request(struct uds_request *request) +{ + int result; + struct uds_index *index = request->index; + + if (request->zone_message.type != UDS_MESSAGE_NONE) { + result = dispatch_index_zone_control_request(request); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, "error executing message: %d", + request->zone_message.type); + } + + /* Once the message is processed it can be freed. */ + vdo_free(vdo_forget(request)); + return; + } + + index->need_to_save = true; + if (request->requeued && (request->status != UDS_SUCCESS)) { + set_request_location(request, UDS_LOCATION_UNAVAILABLE); + index->callback(request); + return; + } + + result = dispatch_index_request(index, request); + if (result == UDS_QUEUED) { + /* The request has been requeued so don't let it complete. */ + return; + } + + if (!request->found) + set_request_location(request, UDS_LOCATION_UNAVAILABLE); + + request->status = result; + index->callback(request); +} + +static int initialize_index_queues(struct uds_index *index, + const struct index_geometry *geometry) +{ + int result; + unsigned int i; + + for (i = 0; i < index->zone_count; i++) { + result = uds_make_request_queue("indexW", &execute_zone_request, + &index->zone_queues[i]); + if (result != UDS_SUCCESS) + return result; + } + + /* The triage queue is only needed for sparse multi-zone indexes. */ + if ((index->zone_count > 1) && uds_is_sparse_index_geometry(geometry)) { + result = uds_make_request_queue("triageW", &triage_request, + &index->triage_queue); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +/* This is the driver function for the chapter writer thread. */ +static void close_chapters(void *arg) +{ + int result; + struct chapter_writer *writer = arg; + struct uds_index *index = writer->index; + + vdo_log_debug("chapter writer starting"); + mutex_lock(&writer->mutex); + for (;;) { + while (writer->zones_to_write < index->zone_count) { + if (writer->stop && (writer->zones_to_write == 0)) { + /* + * We've been told to stop, and all of the zones are in the same + * open chapter, so we can exit now. + */ + mutex_unlock(&writer->mutex); + vdo_log_debug("chapter writer stopping"); + return; + } + uds_wait_cond(&writer->cond, &writer->mutex); + } + + /* + * Release the lock while closing a chapter. We probably don't need to do this, but + * it seems safer in principle. It's OK to access the chapter and chapter_number + * fields without the lock since those aren't allowed to change until we're done. + */ + mutex_unlock(&writer->mutex); + + if (index->has_saved_open_chapter) { + /* + * Remove the saved open chapter the first time we close an open chapter + * after loading from a clean shutdown, or after doing a clean save. The + * lack of the saved open chapter will indicate that a recovery is + * necessary. + */ + index->has_saved_open_chapter = false; + result = uds_discard_open_chapter(index->layout); + if (result == UDS_SUCCESS) + vdo_log_debug("Discarding saved open chapter"); + } + + result = uds_close_open_chapter(writer->chapters, index->zone_count, + index->volume, + writer->open_chapter_index, + writer->collated_records, + index->newest_virtual_chapter); + + mutex_lock(&writer->mutex); + index->newest_virtual_chapter++; + index->oldest_virtual_chapter += + uds_chapters_to_expire(index->volume->geometry, + index->newest_virtual_chapter); + writer->result = result; + writer->zones_to_write = 0; + uds_broadcast_cond(&writer->cond); + } +} + +static void stop_chapter_writer(struct chapter_writer *writer) +{ + struct thread *writer_thread = NULL; + + mutex_lock(&writer->mutex); + if (writer->thread != NULL) { + writer_thread = writer->thread; + writer->thread = NULL; + writer->stop = true; + uds_broadcast_cond(&writer->cond); + } + mutex_unlock(&writer->mutex); + + if (writer_thread != NULL) + vdo_join_threads(writer_thread); +} + +static void free_chapter_writer(struct chapter_writer *writer) +{ + if (writer == NULL) + return; + + stop_chapter_writer(writer); + uds_free_open_chapter_index(writer->open_chapter_index); + vdo_free(writer->collated_records); + vdo_free(writer); +} + +static int make_chapter_writer(struct uds_index *index, + struct chapter_writer **writer_ptr) +{ + int result; + struct chapter_writer *writer; + size_t collated_records_size = + (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); + + result = vdo_allocate_extended(struct chapter_writer, index->zone_count, + struct open_chapter_zone *, "Chapter Writer", + &writer); + if (result != VDO_SUCCESS) + return result; + + writer->index = index; + mutex_init(&writer->mutex); + uds_init_cond(&writer->cond); + + result = vdo_allocate_cache_aligned(collated_records_size, "collated records", + &writer->collated_records); + if (result != VDO_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + result = uds_make_open_chapter_index(&writer->open_chapter_index, + index->volume->geometry, + index->volume->nonce); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + writer->memory_size = (sizeof(struct chapter_writer) + + index->zone_count * sizeof(struct open_chapter_zone *) + + collated_records_size + + writer->open_chapter_index->memory_size); + + result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread); + if (result != VDO_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + *writer_ptr = writer; + return UDS_SUCCESS; +} + +static int load_index(struct uds_index *index) +{ + int result; + u64 last_save_chapter; + + result = uds_load_index_state(index->layout, index); + if (result != UDS_SUCCESS) + return UDS_INDEX_NOT_SAVED_CLEANLY; + + last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0); + + vdo_log_info("loaded index from chapter %llu through chapter %llu", + (unsigned long long) index->oldest_virtual_chapter, + (unsigned long long) last_save_chapter); + + return UDS_SUCCESS; +} + +static int rebuild_index_page_map(struct uds_index *index, u64 vcn) +{ + int result; + struct delta_index_page *chapter_index_page; + struct index_geometry *geometry = index->volume->geometry; + u32 chapter = uds_map_to_physical_chapter(geometry, vcn); + u32 expected_list_number = 0; + u32 index_page_number; + u32 lowest_delta_list; + u32 highest_delta_list; + + for (index_page_number = 0; + index_page_number < geometry->index_pages_per_chapter; + index_page_number++) { + result = uds_get_volume_index_page(index->volume, chapter, + index_page_number, + &chapter_index_page); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, + "failed to read index page %u in chapter %u", + index_page_number, chapter); + } + + lowest_delta_list = chapter_index_page->lowest_list_number; + highest_delta_list = chapter_index_page->highest_list_number; + if (lowest_delta_list != expected_list_number) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "chapter %u index page %u is corrupt", + chapter, index_page_number); + } + + uds_update_index_page_map(index->volume->index_page_map, vcn, chapter, + index_page_number, highest_delta_list); + expected_list_number = highest_delta_list + 1; + } + + return UDS_SUCCESS; +} + +static int replay_record(struct uds_index *index, const struct uds_record_name *name, + u64 virtual_chapter, bool will_be_sparse_chapter) +{ + int result; + struct volume_index_record record; + bool update_record; + + if (will_be_sparse_chapter && + !uds_is_volume_index_sample(index->volume_index, name)) { + /* + * This entry will be in a sparse chapter after the rebuild completes, and it is + * not a sample, so just skip over it. + */ + return UDS_SUCCESS; + } + + result = uds_get_volume_index_record(index->volume_index, name, &record); + if (result != UDS_SUCCESS) + return result; + + if (record.is_found) { + if (record.is_collision) { + if (record.virtual_chapter == virtual_chapter) { + /* The record is already correct. */ + return UDS_SUCCESS; + } + + update_record = true; + } else if (record.virtual_chapter == virtual_chapter) { + /* + * There is a volume index entry pointing to the current chapter, but we + * don't know if it is for the same name as the one we are currently + * working on or not. For now, we're just going to assume that it isn't. + * This will create one extra collision record if there was a deleted + * record in the current chapter. + */ + update_record = false; + } else { + /* + * If we're rebuilding, we don't normally want to go to disk to see if the + * record exists, since we will likely have just read the record from disk + * (i.e. we know it's there). The exception to this is when we find an + * entry in the volume index that has a different chapter. In this case, we + * need to search that chapter to determine if the volume index entry was + * for the same record or a different one. + */ + result = uds_search_volume_page_cache_for_rebuild(index->volume, + name, + record.virtual_chapter, + &update_record); + if (result != UDS_SUCCESS) + return result; + } + } else { + update_record = false; + } + + if (update_record) { + /* + * Update the volume index to reference the new chapter for the block. If the + * record had been deleted or dropped from the chapter index, it will be back. + */ + result = uds_set_volume_index_record_chapter(&record, virtual_chapter); + } else { + /* + * Add a new entry to the volume index referencing the open chapter. This should be + * done regardless of whether we are a brand new record or a sparse record, i.e. + * one that doesn't exist in the index but does on disk, since for a sparse record, + * we would want to un-sparsify if it did exist. + */ + result = uds_put_volume_index_record(&record, virtual_chapter); + } + + if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { + /* The rebuilt index will lose these records. */ + return UDS_SUCCESS; + } + + return result; +} + +static bool check_for_suspend(struct uds_index *index) +{ + bool closing; + + if (index->load_context == NULL) + return false; + + mutex_lock(&index->load_context->mutex); + if (index->load_context->status != INDEX_SUSPENDING) { + mutex_unlock(&index->load_context->mutex); + return false; + } + + /* Notify that we are suspended and wait for the resume. */ + index->load_context->status = INDEX_SUSPENDED; + uds_broadcast_cond(&index->load_context->cond); + + while ((index->load_context->status != INDEX_OPENING) && + (index->load_context->status != INDEX_FREEING)) + uds_wait_cond(&index->load_context->cond, &index->load_context->mutex); + + closing = (index->load_context->status == INDEX_FREEING); + mutex_unlock(&index->load_context->mutex); + return closing; +} + +static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse) +{ + int result; + u32 i; + u32 j; + const struct index_geometry *geometry; + u32 physical_chapter; + + if (check_for_suspend(index)) { + vdo_log_info("Replay interrupted by index shutdown at chapter %llu", + (unsigned long long) virtual); + return -EBUSY; + } + + geometry = index->volume->geometry; + physical_chapter = uds_map_to_physical_chapter(geometry, virtual); + uds_prefetch_volume_chapter(index->volume, physical_chapter); + uds_set_volume_index_open_chapter(index->volume_index, virtual); + + result = rebuild_index_page_map(index, virtual); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, + "could not rebuild index page map for chapter %u", + physical_chapter); + } + + for (i = 0; i < geometry->record_pages_per_chapter; i++) { + u8 *record_page; + u32 record_page_number; + + record_page_number = geometry->index_pages_per_chapter + i; + result = uds_get_volume_record_page(index->volume, physical_chapter, + record_page_number, &record_page); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, "could not get page %d", + record_page_number); + } + + for (j = 0; j < geometry->records_per_page; j++) { + const u8 *name_bytes; + struct uds_record_name name; + + name_bytes = record_page + (j * BYTES_PER_RECORD); + memcpy(&name.name, name_bytes, UDS_RECORD_NAME_SIZE); + result = replay_record(index, &name, virtual, sparse); + if (result != UDS_SUCCESS) + return result; + } + } + + return UDS_SUCCESS; +} + +static int replay_volume(struct uds_index *index) +{ + int result; + u64 old_map_update; + u64 new_map_update; + u64 virtual; + u64 from_virtual = index->oldest_virtual_chapter; + u64 upto_virtual = index->newest_virtual_chapter; + bool will_be_sparse; + + vdo_log_info("Replaying volume from chapter %llu through chapter %llu", + (unsigned long long) from_virtual, + (unsigned long long) upto_virtual); + + /* + * The index failed to load, so the volume index is empty. Add records to the volume index + * in order, skipping non-hooks in chapters which will be sparse to save time. + * + * Go through each record page of each chapter and add the records back to the volume + * index. This should not cause anything to be written to either the open chapter or the + * on-disk volume. Also skip the on-disk chapter corresponding to upto_virtual, as this + * would have already been purged from the volume index when the chapter was opened. + * + * Also, go through each index page for each chapter and rebuild the index page map. + */ + old_map_update = index->volume->index_page_map->last_update; + for (virtual = from_virtual; virtual < upto_virtual; virtual++) { + will_be_sparse = uds_is_chapter_sparse(index->volume->geometry, + from_virtual, upto_virtual, + virtual); + result = replay_chapter(index, virtual, will_be_sparse); + if (result != UDS_SUCCESS) + return result; + } + + /* Also reap the chapter being replaced by the open chapter. */ + uds_set_volume_index_open_chapter(index->volume_index, upto_virtual); + + new_map_update = index->volume->index_page_map->last_update; + if (new_map_update != old_map_update) { + vdo_log_info("replay changed index page map update from %llu to %llu", + (unsigned long long) old_map_update, + (unsigned long long) new_map_update); + } + + return UDS_SUCCESS; +} + +static int rebuild_index(struct uds_index *index) +{ + int result; + u64 lowest; + u64 highest; + bool is_empty = false; + u32 chapters_per_volume = index->volume->geometry->chapters_per_volume; + + index->volume->lookup_mode = LOOKUP_FOR_REBUILD; + result = uds_find_volume_chapter_boundaries(index->volume, &lowest, &highest, + &is_empty); + if (result != UDS_SUCCESS) { + return vdo_log_fatal_strerror(result, + "cannot rebuild index: unknown volume chapter boundaries"); + } + + if (is_empty) { + index->newest_virtual_chapter = 0; + index->oldest_virtual_chapter = 0; + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; + } + + index->newest_virtual_chapter = highest + 1; + index->oldest_virtual_chapter = lowest; + if (index->newest_virtual_chapter == + (index->oldest_virtual_chapter + chapters_per_volume)) { + /* Skip the chapter shadowed by the open chapter. */ + index->oldest_virtual_chapter++; + } + + result = replay_volume(index); + if (result != UDS_SUCCESS) + return result; + + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; +} + +static void free_index_zone(struct index_zone *zone) +{ + if (zone == NULL) + return; + + uds_free_open_chapter(zone->open_chapter); + uds_free_open_chapter(zone->writing_chapter); + vdo_free(zone); +} + +static int make_index_zone(struct uds_index *index, unsigned int zone_number) +{ + int result; + struct index_zone *zone; + + result = vdo_allocate(1, struct index_zone, "index zone", &zone); + if (result != VDO_SUCCESS) + return result; + + result = uds_make_open_chapter(index->volume->geometry, index->zone_count, + &zone->open_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + result = uds_make_open_chapter(index->volume->geometry, index->zone_count, + &zone->writing_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + zone->index = index; + zone->id = zone_number; + index->zones[zone_number] = zone; + + return UDS_SUCCESS; +} + +int uds_make_index(struct uds_configuration *config, enum uds_open_index_type open_type, + struct index_load_context *load_context, index_callback_fn callback, + struct uds_index **new_index) +{ + int result; + bool loaded = false; + bool new = (open_type == UDS_CREATE); + struct uds_index *index = NULL; + struct index_zone *zone; + u64 nonce; + unsigned int z; + + result = vdo_allocate_extended(struct uds_index, config->zone_count, + struct uds_request_queue *, "index", &index); + if (result != VDO_SUCCESS) + return result; + + index->zone_count = config->zone_count; + + result = uds_make_index_layout(config, new, &index->layout); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + result = vdo_allocate(index->zone_count, struct index_zone *, "zones", + &index->zones); + if (result != VDO_SUCCESS) { + uds_free_index(index); + return result; + } + + result = uds_make_volume(config, index->layout, &index->volume); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + index->volume->lookup_mode = LOOKUP_NORMAL; + for (z = 0; z < index->zone_count; z++) { + result = make_index_zone(index, z); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return vdo_log_error_strerror(result, + "Could not create index zone"); + } + } + + nonce = uds_get_volume_nonce(index->layout); + result = uds_make_volume_index(config, nonce, &index->volume_index); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return vdo_log_error_strerror(result, "could not make volume index"); + } + + index->load_context = load_context; + index->callback = callback; + + result = initialize_index_queues(index, config->geometry); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + result = make_chapter_writer(index, &index->chapter_writer); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + if (!new) { + result = load_index(index); + switch (result) { + case UDS_SUCCESS: + loaded = true; + break; + case -ENOMEM: + /* We should not try a rebuild for this error. */ + vdo_log_error_strerror(result, "index could not be loaded"); + break; + default: + vdo_log_error_strerror(result, "index could not be loaded"); + if (open_type == UDS_LOAD) { + result = rebuild_index(index); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, + "index could not be rebuilt"); + } + } + break; + } + } + + if (result != UDS_SUCCESS) { + uds_free_index(index); + return vdo_log_error_strerror(result, "fatal error in %s()", __func__); + } + + for (z = 0; z < index->zone_count; z++) { + zone = index->zones[z]; + zone->oldest_virtual_chapter = index->oldest_virtual_chapter; + zone->newest_virtual_chapter = index->newest_virtual_chapter; + } + + if (index->load_context != NULL) { + mutex_lock(&index->load_context->mutex); + index->load_context->status = INDEX_READY; + /* + * If we get here, suspend is meaningless, but notify any thread trying to suspend + * us so it doesn't hang. + */ + uds_broadcast_cond(&index->load_context->cond); + mutex_unlock(&index->load_context->mutex); + } + + index->has_saved_open_chapter = loaded; + index->need_to_save = !loaded; + *new_index = index; + return UDS_SUCCESS; +} + +void uds_free_index(struct uds_index *index) +{ + unsigned int i; + + if (index == NULL) + return; + + uds_request_queue_finish(index->triage_queue); + for (i = 0; i < index->zone_count; i++) + uds_request_queue_finish(index->zone_queues[i]); + + free_chapter_writer(index->chapter_writer); + + uds_free_volume_index(index->volume_index); + if (index->zones != NULL) { + for (i = 0; i < index->zone_count; i++) + free_index_zone(index->zones[i]); + vdo_free(index->zones); + } + + uds_free_volume(index->volume); + uds_free_index_layout(vdo_forget(index->layout)); + vdo_free(index); +} + +/* Wait for the chapter writer to complete any outstanding writes. */ +void uds_wait_for_idle_index(struct uds_index *index) +{ + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + while (writer->zones_to_write > 0) + uds_wait_cond(&writer->cond, &writer->mutex); + mutex_unlock(&writer->mutex); +} + +/* This function assumes that all requests have been drained. */ +int uds_save_index(struct uds_index *index) +{ + int result; + + if (!index->need_to_save) + return UDS_SUCCESS; + + uds_wait_for_idle_index(index); + index->prev_save = index->last_save; + index->last_save = ((index->newest_virtual_chapter == 0) ? + NO_LAST_SAVE : index->newest_virtual_chapter - 1); + vdo_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save); + + result = uds_save_index_state(index->layout, index); + if (result != UDS_SUCCESS) { + vdo_log_info("save index failed"); + index->last_save = index->prev_save; + } else { + index->has_saved_open_chapter = true; + index->need_to_save = false; + vdo_log_info("finished save (vcn %llu)", + (unsigned long long) index->last_save); + } + + return result; +} + +int uds_replace_index_storage(struct uds_index *index, struct block_device *bdev) +{ + return uds_replace_volume_storage(index->volume, index->layout, bdev); +} + +/* Accessing statistics should be safe from any thread. */ +void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters) +{ + struct volume_index_stats stats; + + uds_get_volume_index_stats(index->volume_index, &stats); + counters->entries_indexed = stats.record_count; + counters->collisions = stats.collision_count; + counters->entries_discarded = stats.discard_count; + + counters->memory_used = (index->volume_index->memory_size + + index->volume->cache_size + + index->chapter_writer->memory_size); +} + +void uds_enqueue_request(struct uds_request *request, enum request_stage stage) +{ + struct uds_index *index = request->index; + struct uds_request_queue *queue; + + switch (stage) { + case STAGE_TRIAGE: + if (index->triage_queue != NULL) { + queue = index->triage_queue; + break; + } + + fallthrough; + + case STAGE_INDEX: + request->zone_number = + uds_get_volume_index_zone(index->volume_index, &request->record_name); + fallthrough; + + case STAGE_MESSAGE: + queue = index->zone_queues[request->zone_number]; + break; + + default: + VDO_ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage); + return; + } + + uds_request_queue_enqueue(queue, request); +} diff --git a/drivers/md/dm-vdo/indexer/index.h b/drivers/md/dm-vdo/indexer/index.h new file mode 100644 index 0000000000..edabb23954 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_INDEX_H +#define UDS_INDEX_H + +#include "index-layout.h" +#include "index-session.h" +#include "open-chapter.h" +#include "volume.h" +#include "volume-index.h" + +/* + * The index is a high-level structure which represents the totality of the UDS index. It manages + * the queues for incoming requests and dispatches them to the appropriate sub-components like the + * volume or the volume index. It also manages administrative tasks such as saving and loading the + * index. + * + * The index is divided into a number of independent zones and assigns each request to a zone based + * on its name. Most sub-components are similarly divided into zones as well so that requests in + * each zone usually operate without interference or coordination between zones. + */ + +typedef void (*index_callback_fn)(struct uds_request *request); + +struct index_zone { + struct uds_index *index; + struct open_chapter_zone *open_chapter; + struct open_chapter_zone *writing_chapter; + u64 oldest_virtual_chapter; + u64 newest_virtual_chapter; + unsigned int id; +}; + +struct uds_index { + bool has_saved_open_chapter; + bool need_to_save; + struct index_load_context *load_context; + struct index_layout *layout; + struct volume_index *volume_index; + struct volume *volume; + unsigned int zone_count; + struct index_zone **zones; + + u64 oldest_virtual_chapter; + u64 newest_virtual_chapter; + + u64 last_save; + u64 prev_save; + struct chapter_writer *chapter_writer; + + index_callback_fn callback; + struct uds_request_queue *triage_queue; + struct uds_request_queue *zone_queues[]; +}; + +enum request_stage { + STAGE_TRIAGE, + STAGE_INDEX, + STAGE_MESSAGE, +}; + +int __must_check uds_make_index(struct uds_configuration *config, + enum uds_open_index_type open_type, + struct index_load_context *load_context, + index_callback_fn callback, struct uds_index **new_index); + +int __must_check uds_save_index(struct uds_index *index); + +void uds_free_index(struct uds_index *index); + +int __must_check uds_replace_index_storage(struct uds_index *index, + struct block_device *bdev); + +void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters); + +void uds_enqueue_request(struct uds_request *request, enum request_stage stage); + +void uds_wait_for_idle_index(struct uds_index *index); + +#endif /* UDS_INDEX_H */ diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h new file mode 100644 index 0000000000..3744aaf625 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -0,0 +1,353 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef INDEXER_H +#define INDEXER_H + +#include +#include +#include +#include + +#include "funnel-queue.h" + +/* + * UDS public API + * + * The Universal Deduplication System (UDS) is an efficient name-value store. When used for + * deduplicating storage, the names are generally hashes of data blocks and the associated data is + * where that block is located on the underlying storage medium. The stored names are expected to + * be randomly distributed among the space of possible names. If this assumption is violated, the + * UDS index will store fewer names than normal but will otherwise continue to work. The data + * associated with each name can be any 16-byte value. + * + * A client must first create an index session to interact with an index. Once created, the session + * can be shared among multiple threads or users. When a session is destroyed, it will also close + * and save any associated index. + * + * To make a request, a client must allocate a uds_request structure and set the required fields + * before launching it. UDS will invoke the provided callback to complete the request. After the + * callback has been called, the uds_request structure can be freed or reused for a new request. + * There are five types of requests: + * + * A UDS_UPDATE request will associate the provided name with the provided data. Any previous data + * associated with that name will be discarded. + * + * A UDS_QUERY request will return the data associated with the provided name, if any. The entry + * for the name will also be marked as most recent, as if the data had been updated. + * + * A UDS_POST request is a combination of UDS_QUERY and UDS_UPDATE. If there is already data + * associated with the provided name, that data is returned. If there is no existing association, + * the name is associated with the newly provided data. This request is equivalent to a UDS_QUERY + * request followed by a UDS_UPDATE request if no data is found, but it is much more efficient. + * + * A UDS_QUERY_NO_UPDATE request will return the data associated with the provided name, but will + * not change the recency of the entry for the name. This request is primarily useful for testing, + * to determine whether an entry exists without changing the internal state of the index. + * + * A UDS_DELETE request removes any data associated with the provided name. This operation is + * generally not necessary, because the index will automatically discard its oldest entries once it + * becomes full. + */ + +/* General UDS constants and structures */ + +enum uds_request_type { + /* Create or update the mapping for a name, and make the name most recent. */ + UDS_UPDATE, + + /* Return any mapped data for a name, and make the name most recent. */ + UDS_QUERY, + + /* + * Return any mapped data for a name, or map the provided data to the name if there is no + * current data, and make the name most recent. + */ + UDS_POST, + + /* Return any mapped data for a name without updating its recency. */ + UDS_QUERY_NO_UPDATE, + + /* Remove any mapping for a name. */ + UDS_DELETE, + +}; + +enum uds_open_index_type { + /* Create a new index. */ + UDS_CREATE, + + /* Load an existing index and try to recover if necessary. */ + UDS_LOAD, + + /* Load an existing index, but only if it was saved cleanly. */ + UDS_NO_REBUILD, +}; + +enum { + /* The record name size in bytes */ + UDS_RECORD_NAME_SIZE = 16, + /* The maximum record data size in bytes */ + UDS_RECORD_DATA_SIZE = 16, +}; + +/* + * A type representing a UDS memory configuration which is either a positive integer number of + * gigabytes or one of the six special constants for configurations smaller than one gigabyte. + */ +typedef int uds_memory_config_size_t; + +enum { + /* The maximum configurable amount of memory */ + UDS_MEMORY_CONFIG_MAX = 1024, + /* Flag indicating that the index has one less chapter than usual */ + UDS_MEMORY_CONFIG_REDUCED = 0x1000, + UDS_MEMORY_CONFIG_REDUCED_MAX = 1024 + UDS_MEMORY_CONFIG_REDUCED, + /* Special values indicating sizes less than 1 GB */ + UDS_MEMORY_CONFIG_256MB = -256, + UDS_MEMORY_CONFIG_512MB = -512, + UDS_MEMORY_CONFIG_768MB = -768, + UDS_MEMORY_CONFIG_REDUCED_256MB = -1280, + UDS_MEMORY_CONFIG_REDUCED_512MB = -1536, + UDS_MEMORY_CONFIG_REDUCED_768MB = -1792, +}; + +struct uds_record_name { + unsigned char name[UDS_RECORD_NAME_SIZE]; +}; + +struct uds_record_data { + unsigned char data[UDS_RECORD_DATA_SIZE]; +}; + +struct uds_volume_record { + struct uds_record_name name; + struct uds_record_data data; +}; + +struct uds_parameters { + /* The block_device used for storage */ + struct block_device *bdev; + /* The maximum allowable size of the index on storage */ + size_t size; + /* The offset where the index should start */ + off_t offset; + /* The maximum memory allocation, in GB */ + uds_memory_config_size_t memory_size; + /* Whether the index should include sparse chapters */ + bool sparse; + /* A 64-bit nonce to validate the index */ + u64 nonce; + /* The number of threads used to process index requests */ + unsigned int zone_count; + /* The number of threads used to read volume pages */ + unsigned int read_threads; +}; + +/* + * These statistics capture characteristics of the current index, including resource usage and + * requests processed since the index was opened. + */ +struct uds_index_stats { + /* The total number of records stored in the index */ + u64 entries_indexed; + /* An estimate of the index's memory usage, in bytes */ + u64 memory_used; + /* The number of collisions recorded in the volume index */ + u64 collisions; + /* The number of entries discarded from the index since startup */ + u64 entries_discarded; + /* The time at which these statistics were fetched */ + s64 current_time; + /* The number of post calls that found an existing entry */ + u64 posts_found; + /* The number of post calls that added an entry */ + u64 posts_not_found; + /* + * The number of post calls that found an existing entry that is current enough to only + * exist in memory and not have been committed to disk yet + */ + u64 in_memory_posts_found; + /* + * The number of post calls that found an existing entry in the dense portion of the index + */ + u64 dense_posts_found; + /* + * The number of post calls that found an existing entry in the sparse portion of the index + */ + u64 sparse_posts_found; + /* The number of update calls that updated an existing entry */ + u64 updates_found; + /* The number of update calls that added a new entry */ + u64 updates_not_found; + /* The number of delete requests that deleted an existing entry */ + u64 deletions_found; + /* The number of delete requests that did nothing */ + u64 deletions_not_found; + /* The number of query calls that found existing entry */ + u64 queries_found; + /* The number of query calls that did not find an entry */ + u64 queries_not_found; + /* The total number of requests processed */ + u64 requests; +}; + +enum uds_index_region { + /* No location information has been determined */ + UDS_LOCATION_UNKNOWN = 0, + /* The index page entry has been found */ + UDS_LOCATION_INDEX_PAGE_LOOKUP, + /* The record page entry has been found */ + UDS_LOCATION_RECORD_PAGE_LOOKUP, + /* The record is not in the index */ + UDS_LOCATION_UNAVAILABLE, + /* The record was found in the open chapter */ + UDS_LOCATION_IN_OPEN_CHAPTER, + /* The record was found in the dense part of the index */ + UDS_LOCATION_IN_DENSE, + /* The record was found in the sparse part of the index */ + UDS_LOCATION_IN_SPARSE, +} __packed; + +/* Zone message requests are used to communicate between index zones. */ +enum uds_zone_message_type { + /* A standard request with no message */ + UDS_MESSAGE_NONE = 0, + /* Add a chapter to the sparse chapter index cache */ + UDS_MESSAGE_SPARSE_CACHE_BARRIER, + /* Close a chapter to keep the zone from falling behind */ + UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED, +} __packed; + +struct uds_zone_message { + /* The type of message, determining how it will be processed */ + enum uds_zone_message_type type; + /* The virtual chapter number to which the message applies */ + u64 virtual_chapter; +}; + +struct uds_index_session; +struct uds_index; +struct uds_request; + +/* Once this callback has been invoked, the uds_request structure can be reused or freed. */ +typedef void (*uds_request_callback_fn)(struct uds_request *request); + +struct uds_request { + /* These input fields must be set before launching a request. */ + + /* The name of the record to look up or create */ + struct uds_record_name record_name; + /* New data to associate with the record name, if applicable */ + struct uds_record_data new_metadata; + /* A callback to invoke when the request is complete */ + uds_request_callback_fn callback; + /* The index session that will manage this request */ + struct uds_index_session *session; + /* The type of operation to perform, as describe above */ + enum uds_request_type type; + + /* These output fields are set when a request is complete. */ + + /* The existing data associated with the request name, if any */ + struct uds_record_data old_metadata; + /* Either UDS_SUCCESS or an error code for the request */ + int status; + /* True if the record name had an existing entry in the index */ + bool found; + + /* + * The remaining fields are used internally and should not be altered by clients. The index + * relies on zone_number being the first field in this section. + */ + + /* The number of the zone which will process this request*/ + unsigned int zone_number; + /* A link for adding a request to a lock-free queue */ + struct funnel_queue_entry queue_link; + /* A link for adding a request to a standard linked list */ + struct uds_request *next_request; + /* A pointer to the index processing this request */ + struct uds_index *index; + /* Control message for coordinating between zones */ + struct uds_zone_message zone_message; + /* If true, process request immediately by waking the worker thread */ + bool unbatched; + /* If true, continue this request before processing newer requests */ + bool requeued; + /* The virtual chapter containing the record name, if known */ + u64 virtual_chapter; + /* The region of the index containing the record name */ + enum uds_index_region location; +}; + +/* Compute the number of bytes needed to store an index. */ +int __must_check uds_compute_index_size(const struct uds_parameters *parameters, + u64 *index_size); + +/* A session is required for most index operations. */ +int __must_check uds_create_index_session(struct uds_index_session **session); + +/* Destroying an index session also closes and saves the associated index. */ +int uds_destroy_index_session(struct uds_index_session *session); + +/* + * Create or open an index with an existing session. This operation fails if the index session is + * suspended, or if there is already an open index. + */ +int __must_check uds_open_index(enum uds_open_index_type open_type, + const struct uds_parameters *parameters, + struct uds_index_session *session); + +/* + * Wait until all callbacks for index operations are complete, and prevent new index operations + * from starting. New index operations will fail with EBUSY until the session is resumed. Also + * optionally saves the index. + */ +int __must_check uds_suspend_index_session(struct uds_index_session *session, bool save); + +/* + * Allow new index operations for an index, whether it was suspended or not. If the index is + * suspended and the supplied block device differs from the current backing store, the index will + * start using the new backing store instead. + */ +int __must_check uds_resume_index_session(struct uds_index_session *session, + struct block_device *bdev); + +/* Wait until all outstanding index operations are complete. */ +int __must_check uds_flush_index_session(struct uds_index_session *session); + +/* Close an index. This operation fails if the index session is suspended. */ +int __must_check uds_close_index(struct uds_index_session *session); + +/* Get index statistics since the last time the index was opened. */ +int __must_check uds_get_index_session_stats(struct uds_index_session *session, + struct uds_index_stats *stats); + +/* This function will fail if any required field of the request is not set. */ +int __must_check uds_launch_request(struct uds_request *request); + +struct cond_var { + wait_queue_head_t wait_queue; +}; + +static inline void uds_init_cond(struct cond_var *cv) +{ + init_waitqueue_head(&cv->wait_queue); +} + +static inline void uds_signal_cond(struct cond_var *cv) +{ + wake_up(&cv->wait_queue); +} + +static inline void uds_broadcast_cond(struct cond_var *cv) +{ + wake_up_all(&cv->wait_queue); +} + +void uds_wait_cond(struct cond_var *cv, struct mutex *mutex); + +#endif /* INDEXER_H */ diff --git a/drivers/md/dm-vdo/indexer/io-factory.c b/drivers/md/dm-vdo/indexer/io-factory.c new file mode 100644 index 0000000000..515765d357 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/io-factory.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "io-factory.h" + +#include +#include +#include +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" + +/* + * The I/O factory object manages access to index storage, which is a contiguous range of blocks on + * a block device. + * + * The factory holds the open device and is responsible for closing it. The factory has methods to + * make helper structures that can be used to access sections of the index. + */ +struct io_factory { + struct block_device *bdev; + atomic_t ref_count; +}; + +/* The buffered reader allows efficient I/O by reading page-sized segments into a buffer. */ +struct buffered_reader { + struct io_factory *factory; + struct dm_bufio_client *client; + struct dm_buffer *buffer; + sector_t limit; + sector_t block_number; + u8 *start; + u8 *end; +}; + +#define MAX_READ_AHEAD_BLOCKS 4 + +/* + * The buffered writer allows efficient I/O by buffering writes and committing page-sized segments + * to storage. + */ +struct buffered_writer { + struct io_factory *factory; + struct dm_bufio_client *client; + struct dm_buffer *buffer; + sector_t limit; + sector_t block_number; + u8 *start; + u8 *end; + int error; +}; + +static void uds_get_io_factory(struct io_factory *factory) +{ + atomic_inc(&factory->ref_count); +} + +int uds_make_io_factory(struct block_device *bdev, struct io_factory **factory_ptr) +{ + int result; + struct io_factory *factory; + + result = vdo_allocate(1, struct io_factory, __func__, &factory); + if (result != VDO_SUCCESS) + return result; + + factory->bdev = bdev; + atomic_set_release(&factory->ref_count, 1); + + *factory_ptr = factory; + return UDS_SUCCESS; +} + +int uds_replace_storage(struct io_factory *factory, struct block_device *bdev) +{ + factory->bdev = bdev; + return UDS_SUCCESS; +} + +/* Free an I/O factory once all references have been released. */ +void uds_put_io_factory(struct io_factory *factory) +{ + if (atomic_add_return(-1, &factory->ref_count) <= 0) + vdo_free(factory); +} + +size_t uds_get_writable_size(struct io_factory *factory) +{ + return i_size_read(factory->bdev->bd_inode); +} + +/* Create a struct dm_bufio_client for an index region starting at offset. */ +int uds_make_bufio(struct io_factory *factory, off_t block_offset, size_t block_size, + unsigned int reserved_buffers, struct dm_bufio_client **client_ptr) +{ + struct dm_bufio_client *client; + + client = dm_bufio_client_create(factory->bdev, block_size, reserved_buffers, 0, + NULL, NULL, 0); + if (IS_ERR(client)) + return -PTR_ERR(client); + + dm_bufio_set_sector_offset(client, block_offset * SECTORS_PER_BLOCK); + *client_ptr = client; + return UDS_SUCCESS; +} + +static void read_ahead(struct buffered_reader *reader, sector_t block_number) +{ + if (block_number < reader->limit) { + sector_t read_ahead = min((sector_t) MAX_READ_AHEAD_BLOCKS, + reader->limit - block_number); + + dm_bufio_prefetch(reader->client, block_number, read_ahead); + } +} + +void uds_free_buffered_reader(struct buffered_reader *reader) +{ + if (reader == NULL) + return; + + if (reader->buffer != NULL) + dm_bufio_release(reader->buffer); + + dm_bufio_client_destroy(reader->client); + uds_put_io_factory(reader->factory); + vdo_free(reader); +} + +/* Create a buffered reader for an index region starting at offset. */ +int uds_make_buffered_reader(struct io_factory *factory, off_t offset, u64 block_count, + struct buffered_reader **reader_ptr) +{ + int result; + struct dm_bufio_client *client = NULL; + struct buffered_reader *reader = NULL; + + result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) + return result; + + result = vdo_allocate(1, struct buffered_reader, "buffered reader", &reader); + if (result != VDO_SUCCESS) { + dm_bufio_client_destroy(client); + return result; + } + + *reader = (struct buffered_reader) { + .factory = factory, + .client = client, + .buffer = NULL, + .limit = block_count, + .block_number = 0, + .start = NULL, + .end = NULL, + }; + + read_ahead(reader, 0); + uds_get_io_factory(factory); + *reader_ptr = reader; + return UDS_SUCCESS; +} + +static int position_reader(struct buffered_reader *reader, sector_t block_number, + off_t offset) +{ + struct dm_buffer *buffer = NULL; + void *data; + + if ((reader->end == NULL) || (block_number != reader->block_number)) { + if (block_number >= reader->limit) + return UDS_OUT_OF_RANGE; + + if (reader->buffer != NULL) + dm_bufio_release(vdo_forget(reader->buffer)); + + data = dm_bufio_read(reader->client, block_number, &buffer); + if (IS_ERR(data)) + return -PTR_ERR(data); + + reader->buffer = buffer; + reader->start = data; + if (block_number == reader->block_number + 1) + read_ahead(reader, block_number + 1); + } + + reader->block_number = block_number; + reader->end = reader->start + offset; + return UDS_SUCCESS; +} + +static size_t bytes_remaining_in_read_buffer(struct buffered_reader *reader) +{ + return (reader->end == NULL) ? 0 : reader->start + UDS_BLOCK_SIZE - reader->end; +} + +static int reset_reader(struct buffered_reader *reader) +{ + sector_t block_number; + + if (bytes_remaining_in_read_buffer(reader) > 0) + return UDS_SUCCESS; + + block_number = reader->block_number; + if (reader->end != NULL) + block_number++; + + return position_reader(reader, block_number, 0); +} + +int uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data, + size_t length) +{ + int result = UDS_SUCCESS; + size_t chunk_size; + + while (length > 0) { + result = reset_reader(reader); + if (result != UDS_SUCCESS) + return result; + + chunk_size = min(length, bytes_remaining_in_read_buffer(reader)); + memcpy(data, reader->end, chunk_size); + length -= chunk_size; + data += chunk_size; + reader->end += chunk_size; + } + + return UDS_SUCCESS; +} + +/* + * Verify that the next data on the reader matches the required value. If the value matches, the + * matching contents are consumed. If the value does not match, the reader state is unchanged. + */ +int uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value, + size_t length) +{ + int result = UDS_SUCCESS; + size_t chunk_size; + sector_t start_block_number = reader->block_number; + int start_offset = reader->end - reader->start; + + while (length > 0) { + result = reset_reader(reader); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_DATA; + break; + } + + chunk_size = min(length, bytes_remaining_in_read_buffer(reader)); + if (memcmp(value, reader->end, chunk_size) != 0) { + result = UDS_CORRUPT_DATA; + break; + } + + length -= chunk_size; + value += chunk_size; + reader->end += chunk_size; + } + + if (result != UDS_SUCCESS) + position_reader(reader, start_block_number, start_offset); + + return result; +} + +/* Create a buffered writer for an index region starting at offset. */ +int uds_make_buffered_writer(struct io_factory *factory, off_t offset, u64 block_count, + struct buffered_writer **writer_ptr) +{ + int result; + struct dm_bufio_client *client = NULL; + struct buffered_writer *writer; + + result = uds_make_bufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) + return result; + + result = vdo_allocate(1, struct buffered_writer, "buffered writer", &writer); + if (result != VDO_SUCCESS) { + dm_bufio_client_destroy(client); + return result; + } + + *writer = (struct buffered_writer) { + .factory = factory, + .client = client, + .buffer = NULL, + .limit = block_count, + .start = NULL, + .end = NULL, + .block_number = 0, + .error = UDS_SUCCESS, + }; + + uds_get_io_factory(factory); + *writer_ptr = writer; + return UDS_SUCCESS; +} + +static size_t get_remaining_write_space(struct buffered_writer *writer) +{ + return writer->start + UDS_BLOCK_SIZE - writer->end; +} + +static int __must_check prepare_next_buffer(struct buffered_writer *writer) +{ + struct dm_buffer *buffer = NULL; + void *data; + + if (writer->block_number >= writer->limit) { + writer->error = UDS_OUT_OF_RANGE; + return UDS_OUT_OF_RANGE; + } + + data = dm_bufio_new(writer->client, writer->block_number, &buffer); + if (IS_ERR(data)) { + writer->error = -PTR_ERR(data); + return writer->error; + } + + writer->buffer = buffer; + writer->start = data; + writer->end = data; + return UDS_SUCCESS; +} + +static int flush_previous_buffer(struct buffered_writer *writer) +{ + size_t available; + + if (writer->buffer == NULL) + return writer->error; + + if (writer->error == UDS_SUCCESS) { + available = get_remaining_write_space(writer); + + if (available > 0) + memset(writer->end, 0, available); + + dm_bufio_mark_buffer_dirty(writer->buffer); + } + + dm_bufio_release(writer->buffer); + writer->buffer = NULL; + writer->start = NULL; + writer->end = NULL; + writer->block_number++; + return writer->error; +} + +void uds_free_buffered_writer(struct buffered_writer *writer) +{ + int result; + + if (writer == NULL) + return; + + flush_previous_buffer(writer); + result = -dm_bufio_write_dirty_buffers(writer->client); + if (result != UDS_SUCCESS) + vdo_log_warning_strerror(result, "%s: failed to sync storage", __func__); + + dm_bufio_client_destroy(writer->client); + uds_put_io_factory(writer->factory); + vdo_free(writer); +} + +/* + * Append data to the buffer, writing as needed. If no data is provided, zeros are written instead. + * If a write error occurs, it is recorded and returned on every subsequent write attempt. + */ +int uds_write_to_buffered_writer(struct buffered_writer *writer, const u8 *data, + size_t length) +{ + int result = writer->error; + size_t chunk_size; + + while ((length > 0) && (result == UDS_SUCCESS)) { + if (writer->buffer == NULL) { + result = prepare_next_buffer(writer); + continue; + } + + chunk_size = min(length, get_remaining_write_space(writer)); + if (data == NULL) { + memset(writer->end, 0, chunk_size); + } else { + memcpy(writer->end, data, chunk_size); + data += chunk_size; + } + + length -= chunk_size; + writer->end += chunk_size; + + if (get_remaining_write_space(writer) == 0) + result = uds_flush_buffered_writer(writer); + } + + return result; +} + +int uds_flush_buffered_writer(struct buffered_writer *writer) +{ + if (writer->error != UDS_SUCCESS) + return writer->error; + + return flush_previous_buffer(writer); +} diff --git a/drivers/md/dm-vdo/indexer/io-factory.h b/drivers/md/dm-vdo/indexer/io-factory.h new file mode 100644 index 0000000000..7fb5a0616a --- /dev/null +++ b/drivers/md/dm-vdo/indexer/io-factory.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_IO_FACTORY_H +#define UDS_IO_FACTORY_H + +#include + +/* + * The I/O factory manages all low-level I/O operations to the underlying storage device. Its main + * clients are the index layout and the volume. The buffered reader and buffered writer interfaces + * are helpers for accessing data in a contiguous range of storage blocks. + */ + +struct buffered_reader; +struct buffered_writer; + +struct io_factory; + +enum { + UDS_BLOCK_SIZE = 4096, + SECTORS_PER_BLOCK = UDS_BLOCK_SIZE >> SECTOR_SHIFT, +}; + +int __must_check uds_make_io_factory(struct block_device *bdev, + struct io_factory **factory_ptr); + +int __must_check uds_replace_storage(struct io_factory *factory, + struct block_device *bdev); + +void uds_put_io_factory(struct io_factory *factory); + +size_t __must_check uds_get_writable_size(struct io_factory *factory); + +int __must_check uds_make_bufio(struct io_factory *factory, off_t block_offset, + size_t block_size, unsigned int reserved_buffers, + struct dm_bufio_client **client_ptr); + +int __must_check uds_make_buffered_reader(struct io_factory *factory, off_t offset, + u64 block_count, + struct buffered_reader **reader_ptr); + +void uds_free_buffered_reader(struct buffered_reader *reader); + +int __must_check uds_read_from_buffered_reader(struct buffered_reader *reader, u8 *data, + size_t length); + +int __must_check uds_verify_buffered_data(struct buffered_reader *reader, const u8 *value, + size_t length); + +int __must_check uds_make_buffered_writer(struct io_factory *factory, off_t offset, + u64 block_count, + struct buffered_writer **writer_ptr); + +void uds_free_buffered_writer(struct buffered_writer *buffer); + +int __must_check uds_write_to_buffered_writer(struct buffered_writer *writer, + const u8 *data, size_t length); + +int __must_check uds_flush_buffered_writer(struct buffered_writer *writer); + +#endif /* UDS_IO_FACTORY_H */ diff --git a/drivers/md/dm-vdo/indexer/open-chapter.c b/drivers/md/dm-vdo/indexer/open-chapter.c new file mode 100644 index 0000000000..4a67bcadaa --- /dev/null +++ b/drivers/md/dm-vdo/indexer/open-chapter.c @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "open-chapter.h" + +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "config.h" +#include "hash-utils.h" + +/* + * Each index zone has a dedicated open chapter zone structure which gets an equal share of the + * open chapter space. Records are assigned to zones based on their record name. Within each zone, + * records are stored in an array in the order they arrive. Additionally, a reference to each + * record is stored in a hash table to help determine if a new record duplicates an existing one. + * If new metadata for an existing name arrives, the record is altered in place. The array of + * records is 1-based so that record number 0 can be used to indicate an unused hash slot. + * + * Deleted records are marked with a flag rather than actually removed to simplify hash table + * management. The array of deleted flags overlays the array of hash slots, but the flags are + * indexed by record number instead of by record name. The number of hash slots will always be a + * power of two that is greater than the number of records to be indexed, guaranteeing that hash + * insertion cannot fail, and that there are sufficient flags for all records. + * + * Once any open chapter zone fills its available space, the chapter is closed. The records from + * each zone are interleaved to attempt to preserve temporal locality and assigned to record pages. + * Empty or deleted records are replaced by copies of a valid record so that the record pages only + * contain valid records. The chapter then constructs a delta index which maps each record name to + * the record page on which that record can be found, which is split into index pages. These + * structures are then passed to the volume to be recorded on storage. + * + * When the index is saved, the open chapter records are saved in a single array, once again + * interleaved to attempt to preserve temporal locality. When the index is reloaded, there may be a + * different number of zones than previously, so the records must be parcelled out to their new + * zones. In addition, depending on the distribution of record names, a new zone may have more + * records than it has space. In this case, the latest records for that zone will be discarded. + */ + +static const u8 OPEN_CHAPTER_MAGIC[] = "ALBOC"; +static const u8 OPEN_CHAPTER_VERSION[] = "02.00"; + +#define OPEN_CHAPTER_MAGIC_LENGTH (sizeof(OPEN_CHAPTER_MAGIC) - 1) +#define OPEN_CHAPTER_VERSION_LENGTH (sizeof(OPEN_CHAPTER_VERSION) - 1) +#define LOAD_RATIO 2 + +static inline size_t records_size(const struct open_chapter_zone *open_chapter) +{ + return sizeof(struct uds_volume_record) * (1 + open_chapter->capacity); +} + +static inline size_t slots_size(size_t slot_count) +{ + return sizeof(struct open_chapter_zone_slot) * slot_count; +} + +int uds_make_open_chapter(const struct index_geometry *geometry, unsigned int zone_count, + struct open_chapter_zone **open_chapter_ptr) +{ + int result; + struct open_chapter_zone *open_chapter; + size_t capacity = geometry->records_per_chapter / zone_count; + size_t slot_count = (1 << bits_per(capacity * LOAD_RATIO)); + + result = vdo_allocate_extended(struct open_chapter_zone, slot_count, + struct open_chapter_zone_slot, "open chapter", + &open_chapter); + if (result != VDO_SUCCESS) + return result; + + open_chapter->slot_count = slot_count; + open_chapter->capacity = capacity; + result = vdo_allocate_cache_aligned(records_size(open_chapter), "record pages", + &open_chapter->records); + if (result != VDO_SUCCESS) { + uds_free_open_chapter(open_chapter); + return result; + } + + *open_chapter_ptr = open_chapter; + return UDS_SUCCESS; +} + +void uds_reset_open_chapter(struct open_chapter_zone *open_chapter) +{ + open_chapter->size = 0; + open_chapter->deletions = 0; + + memset(open_chapter->records, 0, records_size(open_chapter)); + memset(open_chapter->slots, 0, slots_size(open_chapter->slot_count)); +} + +static unsigned int probe_chapter_slots(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name) +{ + struct uds_volume_record *record; + unsigned int slot_count = open_chapter->slot_count; + unsigned int slot = uds_name_to_hash_slot(name, slot_count); + unsigned int record_number; + unsigned int attempts = 1; + + while (true) { + record_number = open_chapter->slots[slot].record_number; + + /* + * If the hash slot is empty, we've reached the end of a chain without finding the + * record and should terminate the search. + */ + if (record_number == 0) + return slot; + + /* + * If the name of the record referenced by the slot matches and has not been + * deleted, then we've found the requested name. + */ + record = &open_chapter->records[record_number]; + if ((memcmp(&record->name, name, UDS_RECORD_NAME_SIZE) == 0) && + !open_chapter->slots[record_number].deleted) + return slot; + + /* + * Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. This + * performs better than linear probing and works best for 2^N slots. + */ + slot = (slot + attempts++) % slot_count; + } +} + +void uds_search_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name, + struct uds_record_data *metadata, bool *found) +{ + unsigned int slot; + unsigned int record_number; + + slot = probe_chapter_slots(open_chapter, name); + record_number = open_chapter->slots[slot].record_number; + if (record_number == 0) { + *found = false; + } else { + *found = true; + *metadata = open_chapter->records[record_number].data; + } +} + +/* Add a record to the open chapter zone and return the remaining space. */ +int uds_put_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name, + const struct uds_record_data *metadata) +{ + unsigned int slot; + unsigned int record_number; + struct uds_volume_record *record; + + if (open_chapter->size >= open_chapter->capacity) + return 0; + + slot = probe_chapter_slots(open_chapter, name); + record_number = open_chapter->slots[slot].record_number; + + if (record_number == 0) { + record_number = ++open_chapter->size; + open_chapter->slots[slot].record_number = record_number; + } + + record = &open_chapter->records[record_number]; + record->name = *name; + record->data = *metadata; + + return open_chapter->capacity - open_chapter->size; +} + +void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name) +{ + unsigned int slot; + unsigned int record_number; + + slot = probe_chapter_slots(open_chapter, name); + record_number = open_chapter->slots[slot].record_number; + + if (record_number > 0) { + open_chapter->slots[record_number].deleted = true; + open_chapter->deletions += 1; + } +} + +void uds_free_open_chapter(struct open_chapter_zone *open_chapter) +{ + if (open_chapter != NULL) { + vdo_free(open_chapter->records); + vdo_free(open_chapter); + } +} + +/* Map each record name to its record page number in the delta chapter index. */ +static int fill_delta_chapter_index(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, + struct open_chapter_index *index, + struct uds_volume_record *collated_records) +{ + int result; + unsigned int records_per_chapter; + unsigned int records_per_page; + unsigned int record_index; + unsigned int records = 0; + u32 page_number; + unsigned int z; + int overflow_count = 0; + struct uds_volume_record *fill_record = NULL; + + /* + * The record pages should not have any empty space, so find a record with which to fill + * the chapter zone if it was closed early, and also to replace any deleted records. The + * last record in any filled zone is guaranteed to not have been deleted, so use one of + * those. + */ + for (z = 0; z < zone_count; z++) { + struct open_chapter_zone *zone = chapter_zones[z]; + + if (zone->size == zone->capacity) { + fill_record = &zone->records[zone->size]; + break; + } + } + + records_per_chapter = index->geometry->records_per_chapter; + records_per_page = index->geometry->records_per_page; + + for (records = 0; records < records_per_chapter; records++) { + struct uds_volume_record *record = &collated_records[records]; + struct open_chapter_zone *open_chapter; + + /* The record arrays in the zones are 1-based. */ + record_index = 1 + (records / zone_count); + page_number = records / records_per_page; + open_chapter = chapter_zones[records % zone_count]; + + /* Use the fill record in place of an unused record. */ + if (record_index > open_chapter->size || + open_chapter->slots[record_index].deleted) { + *record = *fill_record; + continue; + } + + *record = open_chapter->records[record_index]; + result = uds_put_open_chapter_index_record(index, &record->name, + page_number); + switch (result) { + case UDS_SUCCESS: + break; + case UDS_OVERFLOW: + overflow_count++; + break; + default: + vdo_log_error_strerror(result, + "failed to build open chapter index"); + return result; + } + } + + if (overflow_count > 0) + vdo_log_warning("Failed to add %d entries to chapter index", + overflow_count); + + return UDS_SUCCESS; +} + +int uds_close_open_chapter(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, struct volume *volume, + struct open_chapter_index *chapter_index, + struct uds_volume_record *collated_records, + u64 virtual_chapter_number) +{ + int result; + + uds_empty_open_chapter_index(chapter_index, virtual_chapter_number); + result = fill_delta_chapter_index(chapter_zones, zone_count, chapter_index, + collated_records); + if (result != UDS_SUCCESS) + return result; + + return uds_write_chapter(volume, chapter_index, collated_records); +} + +int uds_save_open_chapter(struct uds_index *index, struct buffered_writer *writer) +{ + int result; + struct open_chapter_zone *open_chapter; + struct uds_volume_record *record; + u8 record_count_data[sizeof(u32)]; + u32 record_count = 0; + unsigned int record_index; + unsigned int z; + + result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(writer, OPEN_CHAPTER_VERSION, + OPEN_CHAPTER_VERSION_LENGTH); + if (result != UDS_SUCCESS) + return result; + + for (z = 0; z < index->zone_count; z++) { + open_chapter = index->zones[z]->open_chapter; + record_count += open_chapter->size - open_chapter->deletions; + } + + put_unaligned_le32(record_count, record_count_data); + result = uds_write_to_buffered_writer(writer, record_count_data, + sizeof(record_count_data)); + if (result != UDS_SUCCESS) + return result; + + record_index = 1; + while (record_count > 0) { + for (z = 0; z < index->zone_count; z++) { + open_chapter = index->zones[z]->open_chapter; + if (record_index > open_chapter->size) + continue; + + if (open_chapter->slots[record_index].deleted) + continue; + + record = &open_chapter->records[record_index]; + result = uds_write_to_buffered_writer(writer, (u8 *) record, + sizeof(*record)); + if (result != UDS_SUCCESS) + return result; + + record_count--; + } + + record_index++; + } + + return uds_flush_buffered_writer(writer); +} + +u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry) +{ + unsigned int records_per_chapter = geometry->records_per_chapter; + + return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + sizeof(u32) + + records_per_chapter * sizeof(struct uds_volume_record); +} + +static int load_version20(struct uds_index *index, struct buffered_reader *reader) +{ + int result; + u32 record_count; + u8 record_count_data[sizeof(u32)]; + struct uds_volume_record record; + + /* + * Track which zones cannot accept any more records. If the open chapter had a different + * number of zones previously, some new zones may have more records than they have space + * for. These overflow records will be discarded. + */ + bool full_flags[MAX_ZONES] = { + false, + }; + + result = uds_read_from_buffered_reader(reader, (u8 *) &record_count_data, + sizeof(record_count_data)); + if (result != UDS_SUCCESS) + return result; + + record_count = get_unaligned_le32(record_count_data); + while (record_count-- > 0) { + unsigned int zone = 0; + + result = uds_read_from_buffered_reader(reader, (u8 *) &record, + sizeof(record)); + if (result != UDS_SUCCESS) + return result; + + if (index->zone_count > 1) + zone = uds_get_volume_index_zone(index->volume_index, + &record.name); + + if (!full_flags[zone]) { + struct open_chapter_zone *open_chapter; + unsigned int remaining; + + open_chapter = index->zones[zone]->open_chapter; + remaining = uds_put_open_chapter(open_chapter, &record.name, + &record.data); + /* Do not allow any zone to fill completely. */ + full_flags[zone] = (remaining <= 1); + } + } + + return UDS_SUCCESS; +} + +int uds_load_open_chapter(struct uds_index *index, struct buffered_reader *reader) +{ + u8 version[OPEN_CHAPTER_VERSION_LENGTH]; + int result; + + result = uds_verify_buffered_data(reader, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) + return result; + + result = uds_read_from_buffered_reader(reader, version, sizeof(version)); + if (result != UDS_SUCCESS) + return result; + + if (memcmp(OPEN_CHAPTER_VERSION, version, sizeof(version)) != 0) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "Invalid open chapter version: %.*s", + (int) sizeof(version), version); + } + + return load_version20(index, reader); +} diff --git a/drivers/md/dm-vdo/indexer/open-chapter.h b/drivers/md/dm-vdo/indexer/open-chapter.h new file mode 100644 index 0000000000..a4250bb195 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/open-chapter.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_OPEN_CHAPTER_H +#define UDS_OPEN_CHAPTER_H + +#include "chapter-index.h" +#include "geometry.h" +#include "index.h" +#include "volume.h" + +/* + * The open chapter tracks the newest records in memory. Like the index as a whole, each open + * chapter is divided into a number of independent zones which are interleaved when the chapter is + * committed to the volume. + */ + +enum { + OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, +}; + +struct open_chapter_zone_slot { + /* If non-zero, the record number addressed by this hash slot */ + unsigned int record_number : OPEN_CHAPTER_RECORD_NUMBER_BITS; + /* If true, the record at the index of this hash slot was deleted */ + bool deleted : 1; +} __packed; + +struct open_chapter_zone { + /* The maximum number of records that can be stored */ + unsigned int capacity; + /* The number of records stored */ + unsigned int size; + /* The number of deleted records */ + unsigned int deletions; + /* Array of chunk records, 1-based */ + struct uds_volume_record *records; + /* The number of slots in the hash table */ + unsigned int slot_count; + /* The hash table slots, referencing virtual record numbers */ + struct open_chapter_zone_slot slots[]; +}; + +int __must_check uds_make_open_chapter(const struct index_geometry *geometry, + unsigned int zone_count, + struct open_chapter_zone **open_chapter_ptr); + +void uds_reset_open_chapter(struct open_chapter_zone *open_chapter); + +void uds_search_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name, + struct uds_record_data *metadata, bool *found); + +int __must_check uds_put_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name, + const struct uds_record_data *metadata); + +void uds_remove_from_open_chapter(struct open_chapter_zone *open_chapter, + const struct uds_record_name *name); + +void uds_free_open_chapter(struct open_chapter_zone *open_chapter); + +int __must_check uds_close_open_chapter(struct open_chapter_zone **chapter_zones, + unsigned int zone_count, struct volume *volume, + struct open_chapter_index *chapter_index, + struct uds_volume_record *collated_records, + u64 virtual_chapter_number); + +int __must_check uds_save_open_chapter(struct uds_index *index, + struct buffered_writer *writer); + +int __must_check uds_load_open_chapter(struct uds_index *index, + struct buffered_reader *reader); + +u64 uds_compute_saved_open_chapter_size(struct index_geometry *geometry); + +#endif /* UDS_OPEN_CHAPTER_H */ diff --git a/drivers/md/dm-vdo/indexer/radix-sort.c b/drivers/md/dm-vdo/indexer/radix-sort.c new file mode 100644 index 0000000000..66b8c706a1 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/radix-sort.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "radix-sort.h" + +#include +#include + +#include "memory-alloc.h" +#include "string-utils.h" + +/* + * This implementation allocates one large object to do the sorting, which can be reused as many + * times as desired. The amount of memory required is logarithmically proportional to the number of + * keys to be sorted. + */ + +/* Piles smaller than this are handled with a simple insertion sort. */ +#define INSERTION_SORT_THRESHOLD 12 + +/* Sort keys are pointers to immutable fixed-length arrays of bytes. */ +typedef const u8 *sort_key_t; + +/* + * The keys are separated into piles based on the byte in each keys at the current offset, so the + * number of keys with each byte must be counted. + */ +struct histogram { + /* The number of non-empty bins */ + u16 used; + /* The index (key byte) of the first non-empty bin */ + u16 first; + /* The index (key byte) of the last non-empty bin */ + u16 last; + /* The number of occurrences of each specific byte */ + u32 size[256]; +}; + +/* + * Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound + * on the stack space needed. + */ +struct task { + /* Pointer to the first key to sort. */ + sort_key_t *first_key; + /* Pointer to the last key to sort. */ + sort_key_t *last_key; + /* The offset into the key at which to continue sorting. */ + u16 offset; + /* The number of bytes remaining in the sort keys. */ + u16 length; +}; + +struct radix_sorter { + unsigned int count; + struct histogram bins; + sort_key_t *pile[256]; + struct task *end_of_stack; + struct task insertion_list[256]; + struct task stack[]; +}; + +/* Compare a segment of two fixed-length keys starting at an offset. */ +static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length) +{ + return memcmp(&key1[offset], &key2[offset], length); +} + +/* Insert the next unsorted key into an array of sorted keys. */ +static inline void insert_key(const struct task task, sort_key_t *next) +{ + /* Pull the unsorted key out, freeing up the array slot. */ + sort_key_t unsorted = *next; + + /* Compare the key to the preceding sorted entries, shifting down ones that are larger. */ + while ((--next >= task.first_key) && + (compare(unsorted, next[0], task.offset, task.length) < 0)) + next[1] = next[0]; + + /* Insert the key into the last slot that was cleared, sorting it. */ + next[1] = unsorted; +} + +/* + * Sort a range of key segments using an insertion sort. This simple sort is faster than the + * 256-way radix sort when the number of keys to sort is small. + */ +static inline void insertion_sort(const struct task task) +{ + sort_key_t *next; + + for (next = task.first_key + 1; next <= task.last_key; next++) + insert_key(task, next); +} + +/* Push a sorting task onto a task stack. */ +static inline void push_task(struct task **stack_pointer, sort_key_t *first_key, + u32 count, u16 offset, u16 length) +{ + struct task *task = (*stack_pointer)++; + + task->first_key = first_key; + task->last_key = &first_key[count - 1]; + task->offset = offset; + task->length = length; +} + +static inline void swap_keys(sort_key_t *a, sort_key_t *b) +{ + sort_key_t c = *a; + *a = *b; + *b = c; +} + +/* + * Count the number of times each byte value appears in the arrays of keys to sort at the current + * offset, keeping track of the number of non-empty bins, and the index of the first and last + * non-empty bin. + */ +static inline void measure_bins(const struct task task, struct histogram *bins) +{ + sort_key_t *key_ptr; + + /* + * Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears + * it all out as it goes. Even though this structure is re-used, we don't need to pay to + * zero it before starting a new tally. + */ + bins->first = U8_MAX; + bins->last = 0; + + for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) { + /* Increment the count for the byte in the key at the current offset. */ + u8 bin = (*key_ptr)[task.offset]; + u32 size = ++bins->size[bin]; + + /* Track non-empty bins. */ + if (size == 1) { + bins->used += 1; + if (bin < bins->first) + bins->first = bin; + + if (bin > bins->last) + bins->last = bin; + } + } +} + +/* + * Convert the bin sizes to pointers to where each pile goes. + * + * pile[0] = first_key + bin->size[0], + * pile[1] = pile[0] + bin->size[1], etc. + * + * After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the + * next radix position. A new task is put on the stack for each pile containing lots of keys, or a + * new task is put on the list for each pile containing few keys. + * + * @stack: pointer the top of the stack + * @end_of_stack: the end of the stack + * @list: pointer the head of the list + * @pile: array for pointers to the end of each pile + * @bins: the histogram of the sizes of each pile + * @first_key: the first key of the stack + * @offset: the next radix position to sort by + * @length: the number of bytes remaining in the sort keys + * + * Return: UDS_SUCCESS or an error code + */ +static inline int push_bins(struct task **stack, struct task *end_of_stack, + struct task **list, sort_key_t *pile[], + struct histogram *bins, sort_key_t *first_key, + u16 offset, u16 length) +{ + sort_key_t *pile_start = first_key; + int bin; + + for (bin = bins->first; ; bin++) { + u32 size = bins->size[bin]; + + /* Skip empty piles. */ + if (size == 0) + continue; + + /* There's no need to sort empty keys. */ + if (length > 0) { + if (size > INSERTION_SORT_THRESHOLD) { + if (*stack >= end_of_stack) + return UDS_BAD_STATE; + + push_task(stack, pile_start, size, offset, length); + } else if (size > 1) { + push_task(list, pile_start, size, offset, length); + } + } + + pile_start += size; + pile[bin] = pile_start; + if (--bins->used == 0) + break; + } + + return UDS_SUCCESS; +} + +int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter) +{ + int result; + unsigned int stack_size = count / INSERTION_SORT_THRESHOLD; + struct radix_sorter *radix_sorter; + + result = vdo_allocate_extended(struct radix_sorter, stack_size, struct task, + __func__, &radix_sorter); + if (result != VDO_SUCCESS) + return result; + + radix_sorter->count = count; + radix_sorter->end_of_stack = radix_sorter->stack + stack_size; + *sorter = radix_sorter; + return UDS_SUCCESS; +} + +void uds_free_radix_sorter(struct radix_sorter *sorter) +{ + vdo_free(sorter); +} + +/* + * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort implementation + * is unstable, so the relative ordering of equal keys is not preserved. + */ +int uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[], + unsigned int count, unsigned short length) +{ + struct task start; + struct histogram *bins = &sorter->bins; + sort_key_t **pile = sorter->pile; + struct task *task_stack = sorter->stack; + + /* All zero-length keys are identical and therefore already sorted. */ + if ((count == 0) || (length == 0)) + return UDS_SUCCESS; + + /* The initial task is to sort the entire length of all the keys. */ + start = (struct task) { + .first_key = keys, + .last_key = &keys[count - 1], + .offset = 0, + .length = length, + }; + + if (count <= INSERTION_SORT_THRESHOLD) { + insertion_sort(start); + return UDS_SUCCESS; + } + + if (count > sorter->count) + return UDS_INVALID_ARGUMENT; + + /* + * Repeatedly consume a sorting task from the stack and process it, pushing new sub-tasks + * onto the stack for each radix-sorted pile. When all tasks and sub-tasks have been + * processed, the stack will be empty and all the keys in the starting task will be fully + * sorted. + */ + for (*task_stack = start; task_stack >= sorter->stack; task_stack--) { + const struct task task = *task_stack; + struct task *insertion_task_list; + int result; + sort_key_t *fence; + sort_key_t *end; + + measure_bins(task, bins); + + /* + * Now that we know how large each bin is, generate pointers for each of the piles + * and push a new task to sort each pile by the next radix byte. + */ + insertion_task_list = sorter->insertion_list; + result = push_bins(&task_stack, sorter->end_of_stack, + &insertion_task_list, pile, bins, task.first_key, + task.offset + 1, task.length - 1); + if (result != UDS_SUCCESS) { + memset(bins, 0, sizeof(*bins)); + return result; + } + + /* Now bins->used is zero again. */ + + /* + * Don't bother processing the last pile: when piles 0..N-1 are all in place, then + * pile N must also be in place. + */ + end = task.last_key - bins->size[bins->last]; + bins->size[bins->last] = 0; + + for (fence = task.first_key; fence <= end; ) { + u8 bin; + sort_key_t key = *fence; + + /* + * The radix byte of the key tells us which pile it belongs in. Swap it for + * an unprocessed item just below that pile, and repeat. + */ + while (--pile[bin = key[task.offset]] > fence) + swap_keys(pile[bin], &key); + + /* + * The pile reached the fence. Put the key at the bottom of that pile, + * completing it, and advance the fence to the next pile. + */ + *fence = key; + fence += bins->size[bin]; + bins->size[bin] = 0; + } + + /* Now bins->size[] is all zero again. */ + + /* + * When the number of keys in a task gets small enough, it is faster to use an + * insertion sort than to keep subdividing into tiny piles. + */ + while (--insertion_task_list >= sorter->insertion_list) + insertion_sort(*insertion_task_list); + } + + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/radix-sort.h b/drivers/md/dm-vdo/indexer/radix-sort.h new file mode 100644 index 0000000000..812949bc2c --- /dev/null +++ b/drivers/md/dm-vdo/indexer/radix-sort.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_RADIX_SORT_H +#define UDS_RADIX_SORT_H + +/* + * Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix + * exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith + * Bostic, and M. Douglas McIlroy, "Engineering Radix Sort". + * + * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf + */ + +struct radix_sorter; + +int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter); + +void uds_free_radix_sorter(struct radix_sorter *sorter); + +int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[], + unsigned int count, unsigned short length); + +#endif /* UDS_RADIX_SORT_H */ diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.c b/drivers/md/dm-vdo/indexer/sparse-cache.c new file mode 100644 index 0000000000..2892016782 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/sparse-cache.c @@ -0,0 +1,624 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "sparse-cache.h" + +#include +#include +#include + +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" + +#include "chapter-index.h" +#include "config.h" +#include "index.h" + +/* + * Since the cache is small, it is implemented as a simple array of cache entries. Searching for a + * specific virtual chapter is implemented as a linear search. The cache replacement policy is + * least-recently-used (LRU). Again, the small size of the cache allows the LRU order to be + * maintained by shifting entries in an array list. + * + * Changing the contents of the cache requires the coordinated participation of all zone threads + * via the careful use of barrier messages sent to all the index zones by the triage queue worker + * thread. The critical invariant for coordination is that the cache membership must not change + * between updates, so that all calls to uds_sparse_cache_contains() from the zone threads must all + * receive the same results for every virtual chapter number. To ensure that critical invariant, + * state changes such as "that virtual chapter is no longer in the volume" and "skip searching that + * chapter because it has had too many cache misses" are represented separately from the cache + * membership information (the virtual chapter number). + * + * As a result of this invariant, we have the guarantee that every zone thread will call + * uds_update_sparse_cache() once and exactly once to request a chapter that is not in the cache, + * and the serialization of the barrier requests from the triage queue ensures they will all + * request the same chapter number. This means the only synchronization we need can be provided by + * a pair of thread barriers used only in the uds_update_sparse_cache() call, providing a critical + * section where a single zone thread can drive the cache update while all the other zone threads + * are known to be blocked, waiting in the second barrier. Outside that critical section, all the + * zone threads implicitly hold a shared lock. Inside it, the thread for zone zero holds an + * exclusive lock. No other threads may access or modify the cache entries. + * + * Chapter statistics must only be modified by a single thread, which is also the zone zero thread. + * All fields that might be frequently updated by that thread are kept in separate cache-aligned + * structures so they will not cause cache contention via "false sharing" with the fields that are + * frequently accessed by all of the zone threads. + * + * The LRU order is managed independently by each zone thread, and each zone uses its own list for + * searching and cache membership queries. The zone zero list is used to decide which chapter to + * evict when the cache is updated, and its search list is copied to the other threads at that + * time. + * + * The virtual chapter number field of the cache entry is the single field indicating whether a + * chapter is a member of the cache or not. The value NO_CHAPTER is used to represent a null or + * undefined chapter number. When present in the virtual chapter number field of a + * cached_chapter_index, it indicates that the cache entry is dead, and all the other fields of + * that entry (other than immutable pointers to cache memory) are undefined and irrelevant. Any + * cache entry that is not marked as dead is fully defined and a member of the cache, and + * uds_sparse_cache_contains() will always return true for any virtual chapter number that appears + * in any of the cache entries. + * + * A chapter index that is a member of the cache may be excluded from searches between calls to + * uds_update_sparse_cache() in two different ways. First, when a chapter falls off the end of the + * volume, its virtual chapter number will be less that the oldest virtual chapter number. Since + * that chapter is no longer part of the volume, there's no point in continuing to search that + * chapter index. Once invalidated, that virtual chapter will still be considered a member of the + * cache, but it will no longer be searched for matching names. + * + * The second mechanism is a heuristic based on keeping track of the number of consecutive search + * misses in a given chapter index. Once that count exceeds a threshold, the skip_search flag will + * be set to true, causing the chapter to be skipped when searching the entire cache, but still + * allowing it to be found when searching for a hook in that specific chapter. Finding a hook will + * clear the skip_search flag, once again allowing the non-hook searches to use that cache entry. + * Again, regardless of the state of the skip_search flag, the virtual chapter must still + * considered to be a member of the cache for uds_sparse_cache_contains(). + */ + +#define SKIP_SEARCH_THRESHOLD 20000 +#define ZONE_ZERO 0 + +/* + * These counters are essentially fields of the struct cached_chapter_index, but are segregated + * into this structure because they are frequently modified. They are grouped and aligned to keep + * them on different cache lines from the chapter fields that are accessed far more often than they + * are updated. + */ +struct __aligned(L1_CACHE_BYTES) cached_index_counters { + u64 consecutive_misses; +}; + +struct __aligned(L1_CACHE_BYTES) cached_chapter_index { + /* + * The virtual chapter number of the cached chapter index. NO_CHAPTER means this cache + * entry is unused. This field must only be modified in the critical section in + * uds_update_sparse_cache(). + */ + u64 virtual_chapter; + + u32 index_pages_count; + + /* + * These pointers are immutable during the life of the cache. The contents of the arrays + * change when the cache entry is replaced. + */ + struct delta_index_page *index_pages; + struct dm_buffer **page_buffers; + + /* + * If set, skip the chapter when searching the entire cache. This flag is just a + * performance optimization. This flag is mutable between cache updates, but it rarely + * changes and is frequently accessed, so it groups with the immutable fields. + */ + bool skip_search; + + /* + * The cache-aligned counters change often and are placed at the end of the structure to + * prevent false sharing with the more stable fields above. + */ + struct cached_index_counters counters; +}; + +/* + * A search_list represents an ordering of the sparse chapter index cache entry array, from most + * recently accessed to least recently accessed, which is the order in which the indexes should be + * searched and the reverse order in which they should be evicted from the cache. + * + * Cache entries that are dead or empty are kept at the end of the list, avoiding the need to even + * iterate over them to search, and ensuring that dead entries are replaced before any live entries + * are evicted. + * + * The search list is instantiated for each zone thread, avoiding any need for synchronization. The + * structure is allocated on a cache boundary to avoid false sharing of memory cache lines between + * zone threads. + */ +struct search_list { + u8 capacity; + u8 first_dead_entry; + struct cached_chapter_index *entries[]; +}; + +struct threads_barrier { + /* Lock for this barrier object */ + struct semaphore lock; + /* Semaphore for threads waiting at this barrier */ + struct semaphore wait; + /* Number of threads which have arrived */ + int arrived; + /* Total number of threads using this barrier */ + int thread_count; +}; + +struct sparse_cache { + const struct index_geometry *geometry; + unsigned int capacity; + unsigned int zone_count; + + unsigned int skip_threshold; + struct search_list *search_lists[MAX_ZONES]; + struct cached_chapter_index **scratch_entries; + + struct threads_barrier begin_update_barrier; + struct threads_barrier end_update_barrier; + + struct cached_chapter_index chapters[]; +}; + +static void initialize_threads_barrier(struct threads_barrier *barrier, + unsigned int thread_count) +{ + sema_init(&barrier->lock, 1); + barrier->arrived = 0; + barrier->thread_count = thread_count; + sema_init(&barrier->wait, 0); +} + +static inline void __down(struct semaphore *semaphore) +{ + /* + * Do not use down(semaphore). Instead use down_interruptible so that + * we do not get 120 second stall messages in kern.log. + */ + while (down_interruptible(semaphore) != 0) { + /* + * If we're called from a user-mode process (e.g., "dmsetup + * remove") while waiting for an operation that may take a + * while (e.g., UDS index save), and a signal is sent (SIGINT, + * SIGUSR2), then down_interruptible will not block. If that + * happens, sleep briefly to avoid keeping the CPU locked up in + * this loop. We could just call cond_resched, but then we'd + * still keep consuming CPU time slices and swamp other threads + * trying to do computational work. + */ + fsleep(1000); + } +} + +static void enter_threads_barrier(struct threads_barrier *barrier) +{ + __down(&barrier->lock); + if (++barrier->arrived == barrier->thread_count) { + /* last thread */ + int i; + + for (i = 1; i < barrier->thread_count; i++) + up(&barrier->wait); + + barrier->arrived = 0; + up(&barrier->lock); + } else { + up(&barrier->lock); + __down(&barrier->wait); + } +} + +static int __must_check initialize_cached_chapter_index(struct cached_chapter_index *chapter, + const struct index_geometry *geometry) +{ + int result; + + chapter->virtual_chapter = NO_CHAPTER; + chapter->index_pages_count = geometry->index_pages_per_chapter; + + result = vdo_allocate(chapter->index_pages_count, struct delta_index_page, + __func__, &chapter->index_pages); + if (result != VDO_SUCCESS) + return result; + + return vdo_allocate(chapter->index_pages_count, struct dm_buffer *, + "sparse index volume pages", &chapter->page_buffers); +} + +static int __must_check make_search_list(struct sparse_cache *cache, + struct search_list **list_ptr) +{ + struct search_list *list; + unsigned int bytes; + u8 i; + int result; + + bytes = (sizeof(struct search_list) + + (cache->capacity * sizeof(struct cached_chapter_index *))); + result = vdo_allocate_cache_aligned(bytes, "search list", &list); + if (result != VDO_SUCCESS) + return result; + + list->capacity = cache->capacity; + list->first_dead_entry = 0; + + for (i = 0; i < list->capacity; i++) + list->entries[i] = &cache->chapters[i]; + + *list_ptr = list; + return UDS_SUCCESS; +} + +int uds_make_sparse_cache(const struct index_geometry *geometry, unsigned int capacity, + unsigned int zone_count, struct sparse_cache **cache_ptr) +{ + int result; + unsigned int i; + struct sparse_cache *cache; + unsigned int bytes; + + bytes = (sizeof(struct sparse_cache) + (capacity * sizeof(struct cached_chapter_index))); + result = vdo_allocate_cache_aligned(bytes, "sparse cache", &cache); + if (result != VDO_SUCCESS) + return result; + + cache->geometry = geometry; + cache->capacity = capacity; + cache->zone_count = zone_count; + + /* + * Scale down the skip threshold since the cache only counts cache misses in zone zero, but + * requests are being handled in all zones. + */ + cache->skip_threshold = (SKIP_SEARCH_THRESHOLD / zone_count); + + initialize_threads_barrier(&cache->begin_update_barrier, zone_count); + initialize_threads_barrier(&cache->end_update_barrier, zone_count); + + for (i = 0; i < capacity; i++) { + result = initialize_cached_chapter_index(&cache->chapters[i], geometry); + if (result != UDS_SUCCESS) + goto out; + } + + for (i = 0; i < zone_count; i++) { + result = make_search_list(cache, &cache->search_lists[i]); + if (result != UDS_SUCCESS) + goto out; + } + + /* purge_search_list() needs some temporary lists for sorting. */ + result = vdo_allocate(capacity * 2, struct cached_chapter_index *, + "scratch entries", &cache->scratch_entries); + if (result != VDO_SUCCESS) + goto out; + + *cache_ptr = cache; + return UDS_SUCCESS; +out: + uds_free_sparse_cache(cache); + return result; +} + +static inline void set_skip_search(struct cached_chapter_index *chapter, + bool skip_search) +{ + /* Check before setting to reduce cache line contention. */ + if (READ_ONCE(chapter->skip_search) != skip_search) + WRITE_ONCE(chapter->skip_search, skip_search); +} + +static void score_search_hit(struct cached_chapter_index *chapter) +{ + chapter->counters.consecutive_misses = 0; + set_skip_search(chapter, false); +} + +static void score_search_miss(struct sparse_cache *cache, + struct cached_chapter_index *chapter) +{ + chapter->counters.consecutive_misses++; + if (chapter->counters.consecutive_misses > cache->skip_threshold) + set_skip_search(chapter, true); +} + +static void release_cached_chapter_index(struct cached_chapter_index *chapter) +{ + unsigned int i; + + chapter->virtual_chapter = NO_CHAPTER; + if (chapter->page_buffers == NULL) + return; + + for (i = 0; i < chapter->index_pages_count; i++) { + if (chapter->page_buffers[i] != NULL) + dm_bufio_release(vdo_forget(chapter->page_buffers[i])); + } +} + +void uds_free_sparse_cache(struct sparse_cache *cache) +{ + unsigned int i; + + if (cache == NULL) + return; + + vdo_free(cache->scratch_entries); + + for (i = 0; i < cache->zone_count; i++) + vdo_free(cache->search_lists[i]); + + for (i = 0; i < cache->capacity; i++) { + release_cached_chapter_index(&cache->chapters[i]); + vdo_free(cache->chapters[i].index_pages); + vdo_free(cache->chapters[i].page_buffers); + } + + vdo_free(cache); +} + +/* + * Take the indicated element of the search list and move it to the start, pushing the pointers + * previously before it back down the list. + */ +static inline void set_newest_entry(struct search_list *search_list, u8 index) +{ + struct cached_chapter_index *newest; + + if (index > 0) { + newest = search_list->entries[index]; + memmove(&search_list->entries[1], &search_list->entries[0], + index * sizeof(struct cached_chapter_index *)); + search_list->entries[0] = newest; + } + + /* + * This function may have moved a dead chapter to the front of the list for reuse, in which + * case the set of dead chapters becomes smaller. + */ + if (search_list->first_dead_entry <= index) + search_list->first_dead_entry++; +} + +bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter, + unsigned int zone_number) +{ + struct search_list *search_list; + struct cached_chapter_index *chapter; + u8 i; + + /* + * The correctness of the barriers depends on the invariant that between calls to + * uds_update_sparse_cache(), the answers this function returns must never vary: the result + * for a given chapter must be identical across zones. That invariant must be maintained + * even if the chapter falls off the end of the volume, or if searching it is disabled + * because of too many search misses. + */ + search_list = cache->search_lists[zone_number]; + for (i = 0; i < search_list->first_dead_entry; i++) { + chapter = search_list->entries[i]; + + if (virtual_chapter == chapter->virtual_chapter) { + if (zone_number == ZONE_ZERO) + score_search_hit(chapter); + + set_newest_entry(search_list, i); + return true; + } + } + + return false; +} + +/* + * Re-sort cache entries into three sets (active, skippable, and dead) while maintaining the LRU + * ordering that already existed. This operation must only be called during the critical section in + * uds_update_sparse_cache(). + */ +static void purge_search_list(struct search_list *search_list, + struct sparse_cache *cache, u64 oldest_virtual_chapter) +{ + struct cached_chapter_index **entries; + struct cached_chapter_index **skipped; + struct cached_chapter_index **dead; + struct cached_chapter_index *chapter; + unsigned int next_alive = 0; + unsigned int next_skipped = 0; + unsigned int next_dead = 0; + unsigned int i; + + entries = &search_list->entries[0]; + skipped = &cache->scratch_entries[0]; + dead = &cache->scratch_entries[search_list->capacity]; + + for (i = 0; i < search_list->first_dead_entry; i++) { + chapter = search_list->entries[i]; + if ((chapter->virtual_chapter < oldest_virtual_chapter) || + (chapter->virtual_chapter == NO_CHAPTER)) + dead[next_dead++] = chapter; + else if (chapter->skip_search) + skipped[next_skipped++] = chapter; + else + entries[next_alive++] = chapter; + } + + memcpy(&entries[next_alive], skipped, + next_skipped * sizeof(struct cached_chapter_index *)); + memcpy(&entries[next_alive + next_skipped], dead, + next_dead * sizeof(struct cached_chapter_index *)); + search_list->first_dead_entry = next_alive + next_skipped; +} + +static int __must_check cache_chapter_index(struct cached_chapter_index *chapter, + u64 virtual_chapter, + const struct volume *volume) +{ + int result; + + release_cached_chapter_index(chapter); + + result = uds_read_chapter_index_from_volume(volume, virtual_chapter, + chapter->page_buffers, + chapter->index_pages); + if (result != UDS_SUCCESS) + return result; + + chapter->counters.consecutive_misses = 0; + chapter->virtual_chapter = virtual_chapter; + chapter->skip_search = false; + + return UDS_SUCCESS; +} + +static inline void copy_search_list(const struct search_list *source, + struct search_list *target) +{ + *target = *source; + memcpy(target->entries, source->entries, + source->capacity * sizeof(struct cached_chapter_index *)); +} + +/* + * Update the sparse cache to contain a chapter index. This function must be called by all the zone + * threads with the same chapter number to correctly enter the thread barriers used to synchronize + * the cache updates. + */ +int uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter) +{ + int result = UDS_SUCCESS; + const struct uds_index *index = zone->index; + struct sparse_cache *cache = index->volume->sparse_cache; + + if (uds_sparse_cache_contains(cache, virtual_chapter, zone->id)) + return UDS_SUCCESS; + + /* + * Wait for every zone thread to reach its corresponding barrier request and invoke this + * function before starting to modify the cache. + */ + enter_threads_barrier(&cache->begin_update_barrier); + + /* + * This is the start of the critical section: the zone zero thread is captain, effectively + * holding an exclusive lock on the sparse cache. All the other zone threads must do + * nothing between the two barriers. They will wait at the end_update_barrier again for the + * captain to finish the update. + */ + + if (zone->id == ZONE_ZERO) { + unsigned int z; + struct search_list *list = cache->search_lists[ZONE_ZERO]; + + purge_search_list(list, cache, zone->oldest_virtual_chapter); + + if (virtual_chapter >= index->oldest_virtual_chapter) { + set_newest_entry(list, list->capacity - 1); + result = cache_chapter_index(list->entries[0], virtual_chapter, + index->volume); + } + + for (z = 1; z < cache->zone_count; z++) + copy_search_list(list, cache->search_lists[z]); + } + + /* + * This is the end of the critical section. All cache invariants must have been restored. + */ + enter_threads_barrier(&cache->end_update_barrier); + return result; +} + +void uds_invalidate_sparse_cache(struct sparse_cache *cache) +{ + unsigned int i; + + for (i = 0; i < cache->capacity; i++) + release_cached_chapter_index(&cache->chapters[i]); +} + +static inline bool should_skip_chapter(struct cached_chapter_index *chapter, + u64 oldest_chapter, u64 requested_chapter) +{ + if ((chapter->virtual_chapter == NO_CHAPTER) || + (chapter->virtual_chapter < oldest_chapter)) + return true; + + if (requested_chapter != NO_CHAPTER) + return requested_chapter != chapter->virtual_chapter; + else + return READ_ONCE(chapter->skip_search); +} + +static int __must_check search_cached_chapter_index(struct cached_chapter_index *chapter, + const struct index_geometry *geometry, + const struct index_page_map *index_page_map, + const struct uds_record_name *name, + u16 *record_page_ptr) +{ + u32 physical_chapter = + uds_map_to_physical_chapter(geometry, chapter->virtual_chapter); + u32 index_page_number = + uds_find_index_page_number(index_page_map, name, physical_chapter); + struct delta_index_page *index_page = + &chapter->index_pages[index_page_number]; + + return uds_search_chapter_index_page(index_page, geometry, name, + record_page_ptr); +} + +int uds_search_sparse_cache(struct index_zone *zone, const struct uds_record_name *name, + u64 *virtual_chapter_ptr, u16 *record_page_ptr) +{ + int result; + struct volume *volume = zone->index->volume; + struct sparse_cache *cache = volume->sparse_cache; + struct cached_chapter_index *chapter; + struct search_list *search_list; + u8 i; + /* Search the entire cache unless a specific chapter was requested. */ + bool search_one = (*virtual_chapter_ptr != NO_CHAPTER); + + *record_page_ptr = NO_CHAPTER_INDEX_ENTRY; + search_list = cache->search_lists[zone->id]; + for (i = 0; i < search_list->first_dead_entry; i++) { + chapter = search_list->entries[i]; + + if (should_skip_chapter(chapter, zone->oldest_virtual_chapter, + *virtual_chapter_ptr)) + continue; + + result = search_cached_chapter_index(chapter, cache->geometry, + volume->index_page_map, name, + record_page_ptr); + if (result != UDS_SUCCESS) + return result; + + if (*record_page_ptr != NO_CHAPTER_INDEX_ENTRY) { + /* + * In theory, this might be a false match while a true match exists in + * another chapter, but that's a very rare case and not worth the extra + * search complexity. + */ + set_newest_entry(search_list, i); + if (zone->id == ZONE_ZERO) + score_search_hit(chapter); + + *virtual_chapter_ptr = chapter->virtual_chapter; + return UDS_SUCCESS; + } + + if (zone->id == ZONE_ZERO) + score_search_miss(cache, chapter); + + if (search_one) + break; + } + + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/sparse-cache.h b/drivers/md/dm-vdo/indexer/sparse-cache.h new file mode 100644 index 0000000000..45e2dcf165 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/sparse-cache.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_SPARSE_CACHE_H +#define UDS_SPARSE_CACHE_H + +#include "geometry.h" +#include "indexer.h" + +/* + * The sparse cache is a cache of entire chapter indexes from sparse chapters used for searching + * for names after all other search paths have failed. It contains only complete chapter indexes; + * record pages from sparse chapters and single index pages used for resolving hooks are kept in + * the regular page cache in the volume. + * + * The most important property of this cache is the absence of synchronization for read operations. + * Safe concurrent access to the cache by the zone threads is controlled by the triage queue and + * the barrier requests it issues to the zone queues. The set of cached chapters does not and must + * not change between the carefully coordinated calls to uds_update_sparse_cache() from the zone + * threads. Outside of updates, every zone will get the same result when calling + * uds_sparse_cache_contains() as every other zone. + */ + +struct index_zone; +struct sparse_cache; + +int __must_check uds_make_sparse_cache(const struct index_geometry *geometry, + unsigned int capacity, unsigned int zone_count, + struct sparse_cache **cache_ptr); + +void uds_free_sparse_cache(struct sparse_cache *cache); + +bool uds_sparse_cache_contains(struct sparse_cache *cache, u64 virtual_chapter, + unsigned int zone_number); + +int __must_check uds_update_sparse_cache(struct index_zone *zone, u64 virtual_chapter); + +void uds_invalidate_sparse_cache(struct sparse_cache *cache); + +int __must_check uds_search_sparse_cache(struct index_zone *zone, + const struct uds_record_name *name, + u64 *virtual_chapter_ptr, u16 *record_page_ptr); + +#endif /* UDS_SPARSE_CACHE_H */ diff --git a/drivers/md/dm-vdo/indexer/volume-index.c b/drivers/md/dm-vdo/indexer/volume-index.c new file mode 100644 index 0000000000..12f954a0c5 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/volume-index.c @@ -0,0 +1,1283 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ +#include "volume-index.h" + +#include +#include +#include +#include +#include + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "numeric.h" +#include "permassert.h" +#include "thread-utils.h" + +#include "config.h" +#include "geometry.h" +#include "hash-utils.h" +#include "indexer.h" + +/* + * The volume index is a combination of two separate subindexes, one containing sparse hook entries + * (retained for all chapters), and one containing the remaining entries (retained only for the + * dense chapters). If there are no sparse chapters, only the non-hook sub index is used, and it + * will contain all records for all chapters. + * + * The volume index is also divided into zones, with one thread operating on each zone. Each + * incoming request is dispatched to the appropriate thread, and then to the appropriate subindex. + * Each delta list is handled by a single zone. To ensure that the distribution of delta lists to + * zones doesn't underflow (leaving some zone with no delta lists), the minimum number of delta + * lists must be the square of the maximum zone count for both subindexes. + * + * Each subindex zone is a delta index where the payload is a chapter number. The volume index can + * compute the delta list number, address, and zone number from the record name in order to + * dispatch record handling to the correct structures. + * + * Most operations that use all the zones take place either before request processing is allowed, + * or after all requests have been flushed in order to shut down. The only multi-threaded operation + * supported during normal operation is the uds_lookup_volume_index_name() method, used to determine + * whether a new chapter should be loaded into the sparse index cache. This operation only uses the + * sparse hook subindex, and the zone mutexes are used to make this operation safe. + * + * There are three ways of expressing chapter numbers in the volume index: virtual, index, and + * rolling. The interface to the volume index uses virtual chapter numbers, which are 64 bits long. + * Internally the subindex stores only the minimal number of bits necessary by masking away the + * high-order bits. When the index needs to deal with ordering of index chapter numbers, as when + * flushing entries from older chapters, it rolls the index chapter number around so that the + * smallest one in use is mapped to 0. See convert_index_to_virtual() or flush_invalid_entries() + * for an example of this technique. + * + * For efficiency, when older chapter numbers become invalid, the index does not immediately remove + * the invalidated entries. Instead it lazily removes them from a given delta list the next time it + * walks that list during normal operation. Because of this, the index size must be increased + * somewhat to accommodate all the invalid entries that have not yet been removed. For the standard + * index sizes, this requires about 4 chapters of old entries per 1024 chapters of valid entries in + * the index. + */ + +struct sub_index_parameters { + /* The number of bits in address mask */ + u8 address_bits; + /* The number of bits in chapter number */ + u8 chapter_bits; + /* The mean delta */ + u32 mean_delta; + /* The number of delta lists */ + u64 list_count; + /* The number of chapters used */ + u32 chapter_count; + /* The number of bits per chapter */ + size_t chapter_size_in_bits; + /* The number of bytes of delta list memory */ + size_t memory_size; + /* The number of bytes the index should keep free at all times */ + size_t target_free_bytes; +}; + +struct split_config { + /* The hook subindex configuration */ + struct uds_configuration hook_config; + struct index_geometry hook_geometry; + + /* The non-hook subindex configuration */ + struct uds_configuration non_hook_config; + struct index_geometry non_hook_geometry; +}; + +struct chapter_range { + u32 chapter_start; + u32 chapter_count; +}; + +#define MAGIC_SIZE 8 + +static const char MAGIC_START_5[] = "MI5-0005"; + +struct sub_index_data { + char magic[MAGIC_SIZE]; /* MAGIC_START_5 */ + u64 volume_nonce; + u64 virtual_chapter_low; + u64 virtual_chapter_high; + u32 first_list; + u32 list_count; +}; + +static const char MAGIC_START_6[] = "MI6-0001"; + +struct volume_index_data { + char magic[MAGIC_SIZE]; /* MAGIC_START_6 */ + u32 sparse_sample_rate; +}; + +static inline u32 extract_address(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + return uds_extract_volume_index_bytes(name) & sub_index->address_mask; +} + +static inline u32 extract_dlist_num(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + u64 bits = uds_extract_volume_index_bytes(name); + + return (bits >> sub_index->address_bits) % sub_index->list_count; +} + +static inline const struct volume_sub_index_zone * +get_zone_for_record(const struct volume_index_record *record) +{ + return &record->sub_index->zones[record->zone_number]; +} + +static inline u64 convert_index_to_virtual(const struct volume_index_record *record, + u32 index_chapter) +{ + const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record); + u32 rolling_chapter = ((index_chapter - volume_index_zone->virtual_chapter_low) & + record->sub_index->chapter_mask); + + return volume_index_zone->virtual_chapter_low + rolling_chapter; +} + +static inline u32 convert_virtual_to_index(const struct volume_sub_index *sub_index, + u64 virtual_chapter) +{ + return virtual_chapter & sub_index->chapter_mask; +} + +static inline bool is_virtual_chapter_indexed(const struct volume_index_record *record, + u64 virtual_chapter) +{ + const struct volume_sub_index_zone *volume_index_zone = get_zone_for_record(record); + + return ((virtual_chapter >= volume_index_zone->virtual_chapter_low) && + (virtual_chapter <= volume_index_zone->virtual_chapter_high)); +} + +static inline bool has_sparse(const struct volume_index *volume_index) +{ + return volume_index->sparse_sample_rate > 0; +} + +bool uds_is_volume_index_sample(const struct volume_index *volume_index, + const struct uds_record_name *name) +{ + if (!has_sparse(volume_index)) + return false; + + return (uds_extract_sampling_bytes(name) % volume_index->sparse_sample_rate) == 0; +} + +static inline const struct volume_sub_index * +get_volume_sub_index(const struct volume_index *volume_index, + const struct uds_record_name *name) +{ + return (uds_is_volume_index_sample(volume_index, name) ? + &volume_index->vi_hook : + &volume_index->vi_non_hook); +} + +static unsigned int get_volume_sub_index_zone(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + return extract_dlist_num(sub_index, name) / sub_index->delta_index.lists_per_zone; +} + +unsigned int uds_get_volume_index_zone(const struct volume_index *volume_index, + const struct uds_record_name *name) +{ + return get_volume_sub_index_zone(get_volume_sub_index(volume_index, name), name); +} + +#define DELTA_LIST_SIZE 256 + +static int compute_volume_sub_index_parameters(const struct uds_configuration *config, + struct sub_index_parameters *params) +{ + u64 entries_in_volume_index, address_span; + u32 chapters_in_volume_index, invalid_chapters; + u32 rounded_chapters; + u64 delta_list_records; + u32 address_count; + u64 index_size_in_bits; + size_t expected_index_size; + u64 min_delta_lists = MAX_ZONES * MAX_ZONES; + struct index_geometry *geometry = config->geometry; + u64 records_per_chapter = geometry->records_per_chapter; + + params->chapter_count = geometry->chapters_per_volume; + /* + * Make sure that the number of delta list records in the volume index does not change when + * the volume is reduced by one chapter. This preserves the mapping from name to volume + * index delta list. + */ + rounded_chapters = params->chapter_count; + if (uds_is_reduced_index_geometry(geometry)) + rounded_chapters += 1; + delta_list_records = records_per_chapter * rounded_chapters; + address_count = config->volume_index_mean_delta * DELTA_LIST_SIZE; + params->list_count = max(delta_list_records / DELTA_LIST_SIZE, min_delta_lists); + params->address_bits = bits_per(address_count - 1); + params->chapter_bits = bits_per(rounded_chapters - 1); + if ((u32) params->list_count != params->list_count) { + return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot initialize volume index with %llu delta lists", + (unsigned long long) params->list_count); + } + + if (params->address_bits > 31) { + return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot initialize volume index with %u address bits", + params->address_bits); + } + + /* + * The probability that a given delta list is not touched during the writing of an entire + * chapter is: + * + * double p_not_touched = pow((double) (params->list_count - 1) / params->list_count, + * records_per_chapter); + * + * For the standard index sizes, about 78% of the delta lists are not touched, and + * therefore contain old index entries that have not been eliminated by the lazy LRU + * processing. Then the number of old index entries that accumulate over the entire index, + * in terms of full chapters worth of entries, is: + * + * double invalid_chapters = p_not_touched / (1.0 - p_not_touched); + * + * For the standard index sizes, the index needs about 3.5 chapters of space for the old + * entries in a 1024 chapter index, so round this up to use 4 chapters per 1024 chapters in + * the index. + */ + invalid_chapters = max(rounded_chapters / 256, 2U); + chapters_in_volume_index = rounded_chapters + invalid_chapters; + entries_in_volume_index = records_per_chapter * chapters_in_volume_index; + + address_span = params->list_count << params->address_bits; + params->mean_delta = address_span / entries_in_volume_index; + + /* + * Compute the expected size of a full index, then set the total memory to be 6% larger + * than that expected size. This number should be large enough that there are not many + * rebalances when the index is full. + */ + params->chapter_size_in_bits = uds_compute_delta_index_size(records_per_chapter, + params->mean_delta, + params->chapter_bits); + index_size_in_bits = params->chapter_size_in_bits * chapters_in_volume_index; + expected_index_size = index_size_in_bits / BITS_PER_BYTE; + params->memory_size = expected_index_size * 106 / 100; + + params->target_free_bytes = expected_index_size / 20; + return UDS_SUCCESS; +} + +static void uninitialize_volume_sub_index(struct volume_sub_index *sub_index) +{ + vdo_free(vdo_forget(sub_index->flush_chapters)); + vdo_free(vdo_forget(sub_index->zones)); + uds_uninitialize_delta_index(&sub_index->delta_index); +} + +void uds_free_volume_index(struct volume_index *volume_index) +{ + if (volume_index == NULL) + return; + + if (volume_index->zones != NULL) + vdo_free(vdo_forget(volume_index->zones)); + + uninitialize_volume_sub_index(&volume_index->vi_non_hook); + uninitialize_volume_sub_index(&volume_index->vi_hook); + vdo_free(volume_index); +} + + +static int compute_volume_sub_index_save_bytes(const struct uds_configuration *config, + size_t *bytes) +{ + struct sub_index_parameters params = { .address_bits = 0 }; + int result; + + result = compute_volume_sub_index_parameters(config, ¶ms); + if (result != UDS_SUCCESS) + return result; + + *bytes = (sizeof(struct sub_index_data) + params.list_count * sizeof(u64) + + uds_compute_delta_index_save_bytes(params.list_count, + params.memory_size)); + return UDS_SUCCESS; +} + +/* This function is only useful if the configuration includes sparse chapters. */ +static void split_configuration(const struct uds_configuration *config, + struct split_config *split) +{ + u64 sample_rate, sample_records; + u64 dense_chapters, sparse_chapters; + + /* Start with copies of the base configuration. */ + split->hook_config = *config; + split->hook_geometry = *config->geometry; + split->hook_config.geometry = &split->hook_geometry; + split->non_hook_config = *config; + split->non_hook_geometry = *config->geometry; + split->non_hook_config.geometry = &split->non_hook_geometry; + + sample_rate = config->sparse_sample_rate; + sparse_chapters = config->geometry->sparse_chapters_per_volume; + dense_chapters = config->geometry->chapters_per_volume - sparse_chapters; + sample_records = config->geometry->records_per_chapter / sample_rate; + + /* Adjust the number of records indexed for each chapter. */ + split->hook_geometry.records_per_chapter = sample_records; + split->non_hook_geometry.records_per_chapter -= sample_records; + + /* Adjust the number of chapters indexed. */ + split->hook_geometry.sparse_chapters_per_volume = 0; + split->non_hook_geometry.sparse_chapters_per_volume = 0; + split->non_hook_geometry.chapters_per_volume = dense_chapters; +} + +static int compute_volume_index_save_bytes(const struct uds_configuration *config, + size_t *bytes) +{ + size_t hook_bytes, non_hook_bytes; + struct split_config split; + int result; + + if (!uds_is_sparse_index_geometry(config->geometry)) + return compute_volume_sub_index_save_bytes(config, bytes); + + split_configuration(config, &split); + result = compute_volume_sub_index_save_bytes(&split.hook_config, &hook_bytes); + if (result != UDS_SUCCESS) + return result; + + result = compute_volume_sub_index_save_bytes(&split.non_hook_config, + &non_hook_bytes); + if (result != UDS_SUCCESS) + return result; + + *bytes = sizeof(struct volume_index_data) + hook_bytes + non_hook_bytes; + return UDS_SUCCESS; +} + +int uds_compute_volume_index_save_blocks(const struct uds_configuration *config, + size_t block_size, u64 *block_count) +{ + size_t bytes; + int result; + + result = compute_volume_index_save_bytes(config, &bytes); + if (result != UDS_SUCCESS) + return result; + + bytes += sizeof(struct delta_list_save_info); + *block_count = DIV_ROUND_UP(bytes, block_size) + MAX_ZONES; + return UDS_SUCCESS; +} + +/* Flush invalid entries while walking the delta list. */ +static inline int flush_invalid_entries(struct volume_index_record *record, + struct chapter_range *flush_range, + u32 *next_chapter_to_invalidate) +{ + int result; + + result = uds_next_delta_index_entry(&record->delta_entry); + if (result != UDS_SUCCESS) + return result; + + while (!record->delta_entry.at_end) { + u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry); + u32 relative_chapter = ((index_chapter - flush_range->chapter_start) & + record->sub_index->chapter_mask); + + if (likely(relative_chapter >= flush_range->chapter_count)) { + if (relative_chapter < *next_chapter_to_invalidate) + *next_chapter_to_invalidate = relative_chapter; + break; + } + + result = uds_remove_delta_index_entry(&record->delta_entry); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +/* Find the matching record, or the list offset where the record would go. */ +static int get_volume_index_entry(struct volume_index_record *record, u32 list_number, + u32 key, struct chapter_range *flush_range) +{ + struct volume_index_record other_record; + const struct volume_sub_index *sub_index = record->sub_index; + u32 next_chapter_to_invalidate = sub_index->chapter_mask; + int result; + + result = uds_start_delta_index_search(&sub_index->delta_index, list_number, 0, + &record->delta_entry); + if (result != UDS_SUCCESS) + return result; + + do { + result = flush_invalid_entries(record, flush_range, + &next_chapter_to_invalidate); + if (result != UDS_SUCCESS) + return result; + } while (!record->delta_entry.at_end && (key > record->delta_entry.key)); + + result = uds_remember_delta_index_offset(&record->delta_entry); + if (result != UDS_SUCCESS) + return result; + + /* Check any collision records for a more precise match. */ + other_record = *record; + if (!other_record.delta_entry.at_end && (key == other_record.delta_entry.key)) { + for (;;) { + u8 collision_name[UDS_RECORD_NAME_SIZE]; + + result = flush_invalid_entries(&other_record, flush_range, + &next_chapter_to_invalidate); + if (result != UDS_SUCCESS) + return result; + + if (other_record.delta_entry.at_end || + !other_record.delta_entry.is_collision) + break; + + result = uds_get_delta_entry_collision(&other_record.delta_entry, + collision_name); + if (result != UDS_SUCCESS) + return result; + + if (memcmp(collision_name, record->name, UDS_RECORD_NAME_SIZE) == 0) { + *record = other_record; + break; + } + } + } + while (!other_record.delta_entry.at_end) { + result = flush_invalid_entries(&other_record, flush_range, + &next_chapter_to_invalidate); + if (result != UDS_SUCCESS) + return result; + } + next_chapter_to_invalidate += flush_range->chapter_start; + next_chapter_to_invalidate &= sub_index->chapter_mask; + flush_range->chapter_start = next_chapter_to_invalidate; + flush_range->chapter_count = 0; + return UDS_SUCCESS; +} + +static int get_volume_sub_index_record(struct volume_sub_index *sub_index, + const struct uds_record_name *name, + struct volume_index_record *record) +{ + int result; + const struct volume_sub_index_zone *volume_index_zone; + u32 address = extract_address(sub_index, name); + u32 delta_list_number = extract_dlist_num(sub_index, name); + u64 flush_chapter = sub_index->flush_chapters[delta_list_number]; + + record->sub_index = sub_index; + record->mutex = NULL; + record->name = name; + record->zone_number = delta_list_number / sub_index->delta_index.lists_per_zone; + volume_index_zone = get_zone_for_record(record); + + if (flush_chapter < volume_index_zone->virtual_chapter_low) { + struct chapter_range range; + u64 flush_count = volume_index_zone->virtual_chapter_low - flush_chapter; + + range.chapter_start = convert_virtual_to_index(sub_index, flush_chapter); + range.chapter_count = (flush_count > sub_index->chapter_mask ? + sub_index->chapter_mask + 1 : + flush_count); + result = get_volume_index_entry(record, delta_list_number, address, + &range); + flush_chapter = convert_index_to_virtual(record, range.chapter_start); + if (flush_chapter > volume_index_zone->virtual_chapter_high) + flush_chapter = volume_index_zone->virtual_chapter_high; + sub_index->flush_chapters[delta_list_number] = flush_chapter; + } else { + result = uds_get_delta_index_entry(&sub_index->delta_index, + delta_list_number, address, + name->name, &record->delta_entry); + } + + if (result != UDS_SUCCESS) + return result; + + record->is_found = + (!record->delta_entry.at_end && (record->delta_entry.key == address)); + if (record->is_found) { + u32 index_chapter = uds_get_delta_entry_value(&record->delta_entry); + + record->virtual_chapter = convert_index_to_virtual(record, index_chapter); + } + + record->is_collision = record->delta_entry.is_collision; + return UDS_SUCCESS; +} + +int uds_get_volume_index_record(struct volume_index *volume_index, + const struct uds_record_name *name, + struct volume_index_record *record) +{ + int result; + + if (uds_is_volume_index_sample(volume_index, name)) { + /* + * Other threads cannot be allowed to call uds_lookup_volume_index_name() while + * this thread is finding the volume index record. Due to the lazy LRU flushing of + * the volume index, uds_get_volume_index_record() is not a read-only operation. + */ + unsigned int zone = + get_volume_sub_index_zone(&volume_index->vi_hook, name); + struct mutex *mutex = &volume_index->zones[zone].hook_mutex; + + mutex_lock(mutex); + result = get_volume_sub_index_record(&volume_index->vi_hook, name, + record); + mutex_unlock(mutex); + /* Remember the mutex so that other operations on the index record can use it. */ + record->mutex = mutex; + } else { + result = get_volume_sub_index_record(&volume_index->vi_non_hook, name, + record); + } + + return result; +} + +int uds_put_volume_index_record(struct volume_index_record *record, u64 virtual_chapter) +{ + int result; + u32 address; + const struct volume_sub_index *sub_index = record->sub_index; + + if (!is_virtual_chapter_indexed(record, virtual_chapter)) { + u64 low = get_zone_for_record(record)->virtual_chapter_low; + u64 high = get_zone_for_record(record)->virtual_chapter_high; + + return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot put record into chapter number %llu that is out of the valid range %llu to %llu", + (unsigned long long) virtual_chapter, + (unsigned long long) low, + (unsigned long long) high); + } + address = extract_address(sub_index, record->name); + if (unlikely(record->mutex != NULL)) + mutex_lock(record->mutex); + result = uds_put_delta_index_entry(&record->delta_entry, address, + convert_virtual_to_index(sub_index, + virtual_chapter), + record->is_found ? record->name->name : NULL); + if (unlikely(record->mutex != NULL)) + mutex_unlock(record->mutex); + switch (result) { + case UDS_SUCCESS: + record->virtual_chapter = virtual_chapter; + record->is_collision = record->delta_entry.is_collision; + record->is_found = true; + break; + case UDS_OVERFLOW: + vdo_log_ratelimit(vdo_log_warning_strerror, UDS_OVERFLOW, + "Volume index entry dropped due to overflow condition"); + uds_log_delta_index_entry(&record->delta_entry); + break; + default: + break; + } + + return result; +} + +int uds_remove_volume_index_record(struct volume_index_record *record) +{ + int result; + + if (!record->is_found) + return vdo_log_warning_strerror(UDS_BAD_STATE, + "illegal operation on new record"); + + /* Mark the record so that it cannot be used again */ + record->is_found = false; + if (unlikely(record->mutex != NULL)) + mutex_lock(record->mutex); + result = uds_remove_delta_index_entry(&record->delta_entry); + if (unlikely(record->mutex != NULL)) + mutex_unlock(record->mutex); + return result; +} + +static void set_volume_sub_index_zone_open_chapter(struct volume_sub_index *sub_index, + unsigned int zone_number, + u64 virtual_chapter) +{ + u64 used_bits = 0; + struct volume_sub_index_zone *zone = &sub_index->zones[zone_number]; + struct delta_zone *delta_zone; + u32 i; + + zone->virtual_chapter_low = (virtual_chapter >= sub_index->chapter_count ? + virtual_chapter - sub_index->chapter_count + 1 : + 0); + zone->virtual_chapter_high = virtual_chapter; + + /* Check to see if the new zone data is too large. */ + delta_zone = &sub_index->delta_index.delta_zones[zone_number]; + for (i = 1; i <= delta_zone->list_count; i++) + used_bits += delta_zone->delta_lists[i].size; + + if (used_bits > sub_index->max_zone_bits) { + /* Expire enough chapters to free the desired space. */ + u64 expire_count = + 1 + (used_bits - sub_index->max_zone_bits) / sub_index->chapter_zone_bits; + + if (expire_count == 1) { + vdo_log_ratelimit(vdo_log_info, + "zone %u: At chapter %llu, expiring chapter %llu early", + zone_number, + (unsigned long long) virtual_chapter, + (unsigned long long) zone->virtual_chapter_low); + zone->early_flushes++; + zone->virtual_chapter_low++; + } else { + u64 first_expired = zone->virtual_chapter_low; + + if (first_expired + expire_count < zone->virtual_chapter_high) { + zone->early_flushes += expire_count; + zone->virtual_chapter_low += expire_count; + } else { + zone->early_flushes += + zone->virtual_chapter_high - zone->virtual_chapter_low; + zone->virtual_chapter_low = zone->virtual_chapter_high; + } + vdo_log_ratelimit(vdo_log_info, + "zone %u: At chapter %llu, expiring chapters %llu to %llu early", + zone_number, + (unsigned long long) virtual_chapter, + (unsigned long long) first_expired, + (unsigned long long) zone->virtual_chapter_low - 1); + } + } +} + +void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index, + unsigned int zone_number, + u64 virtual_chapter) +{ + struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex; + + set_volume_sub_index_zone_open_chapter(&volume_index->vi_non_hook, zone_number, + virtual_chapter); + + /* + * Other threads cannot be allowed to call uds_lookup_volume_index_name() while the open + * chapter number is changing. + */ + if (has_sparse(volume_index)) { + mutex_lock(mutex); + set_volume_sub_index_zone_open_chapter(&volume_index->vi_hook, + zone_number, virtual_chapter); + mutex_unlock(mutex); + } +} + +/* + * Set the newest open chapter number for the index, while also advancing the oldest valid chapter + * number. + */ +void uds_set_volume_index_open_chapter(struct volume_index *volume_index, + u64 virtual_chapter) +{ + unsigned int zone; + + for (zone = 0; zone < volume_index->zone_count; zone++) + uds_set_volume_index_zone_open_chapter(volume_index, zone, virtual_chapter); +} + +int uds_set_volume_index_record_chapter(struct volume_index_record *record, + u64 virtual_chapter) +{ + const struct volume_sub_index *sub_index = record->sub_index; + int result; + + if (!record->is_found) + return vdo_log_warning_strerror(UDS_BAD_STATE, + "illegal operation on new record"); + + if (!is_virtual_chapter_indexed(record, virtual_chapter)) { + u64 low = get_zone_for_record(record)->virtual_chapter_low; + u64 high = get_zone_for_record(record)->virtual_chapter_high; + + return vdo_log_warning_strerror(UDS_INVALID_ARGUMENT, + "cannot set chapter number %llu that is out of the valid range %llu to %llu", + (unsigned long long) virtual_chapter, + (unsigned long long) low, + (unsigned long long) high); + } + + if (unlikely(record->mutex != NULL)) + mutex_lock(record->mutex); + result = uds_set_delta_entry_value(&record->delta_entry, + convert_virtual_to_index(sub_index, + virtual_chapter)); + if (unlikely(record->mutex != NULL)) + mutex_unlock(record->mutex); + if (result != UDS_SUCCESS) + return result; + + record->virtual_chapter = virtual_chapter; + return UDS_SUCCESS; +} + +static u64 lookup_volume_sub_index_name(const struct volume_sub_index *sub_index, + const struct uds_record_name *name) +{ + int result; + u32 address = extract_address(sub_index, name); + u32 delta_list_number = extract_dlist_num(sub_index, name); + unsigned int zone_number = get_volume_sub_index_zone(sub_index, name); + const struct volume_sub_index_zone *zone = &sub_index->zones[zone_number]; + u64 virtual_chapter; + u32 index_chapter; + u32 rolling_chapter; + struct delta_index_entry delta_entry; + + result = uds_get_delta_index_entry(&sub_index->delta_index, delta_list_number, + address, name->name, &delta_entry); + if (result != UDS_SUCCESS) + return NO_CHAPTER; + + if (delta_entry.at_end || (delta_entry.key != address)) + return NO_CHAPTER; + + index_chapter = uds_get_delta_entry_value(&delta_entry); + rolling_chapter = (index_chapter - zone->virtual_chapter_low) & sub_index->chapter_mask; + + virtual_chapter = zone->virtual_chapter_low + rolling_chapter; + if (virtual_chapter > zone->virtual_chapter_high) + return NO_CHAPTER; + + return virtual_chapter; +} + +/* Do a read-only lookup of the record name for sparse cache management. */ +u64 uds_lookup_volume_index_name(const struct volume_index *volume_index, + const struct uds_record_name *name) +{ + unsigned int zone_number = uds_get_volume_index_zone(volume_index, name); + struct mutex *mutex = &volume_index->zones[zone_number].hook_mutex; + u64 virtual_chapter; + + if (!uds_is_volume_index_sample(volume_index, name)) + return NO_CHAPTER; + + mutex_lock(mutex); + virtual_chapter = lookup_volume_sub_index_name(&volume_index->vi_hook, name); + mutex_unlock(mutex); + + return virtual_chapter; +} + +static void abort_restoring_volume_sub_index(struct volume_sub_index *sub_index) +{ + uds_reset_delta_index(&sub_index->delta_index); +} + +static void abort_restoring_volume_index(struct volume_index *volume_index) +{ + abort_restoring_volume_sub_index(&volume_index->vi_non_hook); + if (has_sparse(volume_index)) + abort_restoring_volume_sub_index(&volume_index->vi_hook); +} + +static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index, + struct buffered_reader **readers, + unsigned int reader_count) +{ + unsigned int z; + int result; + u64 virtual_chapter_low = 0, virtual_chapter_high = 0; + unsigned int i; + + for (i = 0; i < reader_count; i++) { + struct sub_index_data header; + u8 buffer[sizeof(struct sub_index_data)]; + size_t offset = 0; + u32 j; + + result = uds_read_from_buffered_reader(readers[i], buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read volume index header"); + } + + memcpy(&header.magic, buffer, MAGIC_SIZE); + offset += MAGIC_SIZE; + decode_u64_le(buffer, &offset, &header.volume_nonce); + decode_u64_le(buffer, &offset, &header.virtual_chapter_low); + decode_u64_le(buffer, &offset, &header.virtual_chapter_high); + decode_u32_le(buffer, &offset, &header.first_list); + decode_u32_le(buffer, &offset, &header.list_count); + + result = VDO_ASSERT(offset == sizeof(buffer), + "%zu bytes decoded of %zu expected", offset, + sizeof(buffer)); + if (result != VDO_SUCCESS) + result = UDS_CORRUPT_DATA; + + if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "volume index file had bad magic number"); + } + + if (sub_index->volume_nonce == 0) { + sub_index->volume_nonce = header.volume_nonce; + } else if (header.volume_nonce != sub_index->volume_nonce) { + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "volume index volume nonce incorrect"); + } + + if (i == 0) { + virtual_chapter_low = header.virtual_chapter_low; + virtual_chapter_high = header.virtual_chapter_high; + } else if (virtual_chapter_high != header.virtual_chapter_high) { + u64 low = header.virtual_chapter_low; + u64 high = header.virtual_chapter_high; + + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "Inconsistent volume index zone files: Chapter range is [%llu,%llu], chapter range %d is [%llu,%llu]", + (unsigned long long) virtual_chapter_low, + (unsigned long long) virtual_chapter_high, + i, (unsigned long long) low, + (unsigned long long) high); + } else if (virtual_chapter_low < header.virtual_chapter_low) { + virtual_chapter_low = header.virtual_chapter_low; + } + + for (j = 0; j < header.list_count; j++) { + u8 decoded[sizeof(u64)]; + + result = uds_read_from_buffered_reader(readers[i], decoded, + sizeof(u64)); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read volume index flush ranges"); + } + + sub_index->flush_chapters[header.first_list + j] = + get_unaligned_le64(decoded); + } + } + + for (z = 0; z < sub_index->zone_count; z++) { + memset(&sub_index->zones[z], 0, sizeof(struct volume_sub_index_zone)); + sub_index->zones[z].virtual_chapter_low = virtual_chapter_low; + sub_index->zones[z].virtual_chapter_high = virtual_chapter_high; + } + + result = uds_start_restoring_delta_index(&sub_index->delta_index, readers, + reader_count); + if (result != UDS_SUCCESS) + return vdo_log_warning_strerror(result, "restoring delta index failed"); + + return UDS_SUCCESS; +} + +static int start_restoring_volume_index(struct volume_index *volume_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + unsigned int i; + int result; + + if (!has_sparse(volume_index)) { + return start_restoring_volume_sub_index(&volume_index->vi_non_hook, + buffered_readers, reader_count); + } + + for (i = 0; i < reader_count; i++) { + struct volume_index_data header; + u8 buffer[sizeof(struct volume_index_data)]; + size_t offset = 0; + + result = uds_read_from_buffered_reader(buffered_readers[i], buffer, + sizeof(buffer)); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to read volume index header"); + } + + memcpy(&header.magic, buffer, MAGIC_SIZE); + offset += MAGIC_SIZE; + decode_u32_le(buffer, &offset, &header.sparse_sample_rate); + + result = VDO_ASSERT(offset == sizeof(buffer), + "%zu bytes decoded of %zu expected", offset, + sizeof(buffer)); + if (result != VDO_SUCCESS) + result = UDS_CORRUPT_DATA; + + if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0) + return vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "volume index file had bad magic number"); + + if (i == 0) { + volume_index->sparse_sample_rate = header.sparse_sample_rate; + } else if (volume_index->sparse_sample_rate != header.sparse_sample_rate) { + vdo_log_warning_strerror(UDS_CORRUPT_DATA, + "Inconsistent sparse sample rate in delta index zone files: %u vs. %u", + volume_index->sparse_sample_rate, + header.sparse_sample_rate); + return UDS_CORRUPT_DATA; + } + } + + result = start_restoring_volume_sub_index(&volume_index->vi_non_hook, + buffered_readers, reader_count); + if (result != UDS_SUCCESS) + return result; + + return start_restoring_volume_sub_index(&volume_index->vi_hook, buffered_readers, + reader_count); +} + +static int finish_restoring_volume_sub_index(struct volume_sub_index *sub_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + return uds_finish_restoring_delta_index(&sub_index->delta_index, + buffered_readers, reader_count); +} + +static int finish_restoring_volume_index(struct volume_index *volume_index, + struct buffered_reader **buffered_readers, + unsigned int reader_count) +{ + int result; + + result = finish_restoring_volume_sub_index(&volume_index->vi_non_hook, + buffered_readers, reader_count); + if ((result == UDS_SUCCESS) && has_sparse(volume_index)) { + result = finish_restoring_volume_sub_index(&volume_index->vi_hook, + buffered_readers, + reader_count); + } + + return result; +} + +int uds_load_volume_index(struct volume_index *volume_index, + struct buffered_reader **readers, unsigned int reader_count) +{ + int result; + + /* Start by reading the header section of the stream. */ + result = start_restoring_volume_index(volume_index, readers, reader_count); + if (result != UDS_SUCCESS) + return result; + + result = finish_restoring_volume_index(volume_index, readers, reader_count); + if (result != UDS_SUCCESS) { + abort_restoring_volume_index(volume_index); + return result; + } + + /* Check the final guard lists to make sure there is no extra data. */ + result = uds_check_guard_delta_lists(readers, reader_count); + if (result != UDS_SUCCESS) + abort_restoring_volume_index(volume_index); + + return result; +} + +static int start_saving_volume_sub_index(const struct volume_sub_index *sub_index, + unsigned int zone_number, + struct buffered_writer *buffered_writer) +{ + int result; + struct volume_sub_index_zone *volume_index_zone = &sub_index->zones[zone_number]; + u32 first_list = sub_index->delta_index.delta_zones[zone_number].first_list; + u32 list_count = sub_index->delta_index.delta_zones[zone_number].list_count; + u8 buffer[sizeof(struct sub_index_data)]; + size_t offset = 0; + u32 i; + + memcpy(buffer, MAGIC_START_5, MAGIC_SIZE); + offset += MAGIC_SIZE; + encode_u64_le(buffer, &offset, sub_index->volume_nonce); + encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_low); + encode_u64_le(buffer, &offset, volume_index_zone->virtual_chapter_high); + encode_u32_le(buffer, &offset, first_list); + encode_u32_le(buffer, &offset, list_count); + + result = VDO_ASSERT(offset == sizeof(struct sub_index_data), + "%zu bytes of config written, of %zu expected", offset, + sizeof(struct sub_index_data)); + if (result != VDO_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(buffered_writer, buffer, offset); + if (result != UDS_SUCCESS) + return vdo_log_warning_strerror(result, + "failed to write volume index header"); + + for (i = 0; i < list_count; i++) { + u8 encoded[sizeof(u64)]; + + put_unaligned_le64(sub_index->flush_chapters[first_list + i], &encoded); + result = uds_write_to_buffered_writer(buffered_writer, encoded, + sizeof(u64)); + if (result != UDS_SUCCESS) { + return vdo_log_warning_strerror(result, + "failed to write volume index flush ranges"); + } + } + + return uds_start_saving_delta_index(&sub_index->delta_index, zone_number, + buffered_writer); +} + +static int start_saving_volume_index(const struct volume_index *volume_index, + unsigned int zone_number, + struct buffered_writer *writer) +{ + u8 buffer[sizeof(struct volume_index_data)]; + size_t offset = 0; + int result; + + if (!has_sparse(volume_index)) { + return start_saving_volume_sub_index(&volume_index->vi_non_hook, + zone_number, writer); + } + + memcpy(buffer, MAGIC_START_6, MAGIC_SIZE); + offset += MAGIC_SIZE; + encode_u32_le(buffer, &offset, volume_index->sparse_sample_rate); + result = VDO_ASSERT(offset == sizeof(struct volume_index_data), + "%zu bytes of header written, of %zu expected", offset, + sizeof(struct volume_index_data)); + if (result != VDO_SUCCESS) + return result; + + result = uds_write_to_buffered_writer(writer, buffer, offset); + if (result != UDS_SUCCESS) { + vdo_log_warning_strerror(result, "failed to write volume index header"); + return result; + } + + result = start_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number, + writer); + if (result != UDS_SUCCESS) + return result; + + return start_saving_volume_sub_index(&volume_index->vi_hook, zone_number, + writer); +} + +static int finish_saving_volume_sub_index(const struct volume_sub_index *sub_index, + unsigned int zone_number) +{ + return uds_finish_saving_delta_index(&sub_index->delta_index, zone_number); +} + +static int finish_saving_volume_index(const struct volume_index *volume_index, + unsigned int zone_number) +{ + int result; + + result = finish_saving_volume_sub_index(&volume_index->vi_non_hook, zone_number); + if ((result == UDS_SUCCESS) && has_sparse(volume_index)) + result = finish_saving_volume_sub_index(&volume_index->vi_hook, zone_number); + return result; +} + +int uds_save_volume_index(struct volume_index *volume_index, + struct buffered_writer **writers, unsigned int writer_count) +{ + int result = UDS_SUCCESS; + unsigned int zone; + + for (zone = 0; zone < writer_count; zone++) { + result = start_saving_volume_index(volume_index, zone, writers[zone]); + if (result != UDS_SUCCESS) + break; + + result = finish_saving_volume_index(volume_index, zone); + if (result != UDS_SUCCESS) + break; + + result = uds_write_guard_delta_list(writers[zone]); + if (result != UDS_SUCCESS) + break; + + result = uds_flush_buffered_writer(writers[zone]); + if (result != UDS_SUCCESS) + break; + } + + return result; +} + +static void get_volume_sub_index_stats(const struct volume_sub_index *sub_index, + struct volume_index_stats *stats) +{ + struct delta_index_stats dis; + unsigned int z; + + uds_get_delta_index_stats(&sub_index->delta_index, &dis); + stats->rebalance_time = dis.rebalance_time; + stats->rebalance_count = dis.rebalance_count; + stats->record_count = dis.record_count; + stats->collision_count = dis.collision_count; + stats->discard_count = dis.discard_count; + stats->overflow_count = dis.overflow_count; + stats->delta_lists = dis.list_count; + stats->early_flushes = 0; + for (z = 0; z < sub_index->zone_count; z++) + stats->early_flushes += sub_index->zones[z].early_flushes; +} + +void uds_get_volume_index_stats(const struct volume_index *volume_index, + struct volume_index_stats *stats) +{ + struct volume_index_stats sparse_stats; + + get_volume_sub_index_stats(&volume_index->vi_non_hook, stats); + if (!has_sparse(volume_index)) + return; + + get_volume_sub_index_stats(&volume_index->vi_hook, &sparse_stats); + stats->rebalance_time += sparse_stats.rebalance_time; + stats->rebalance_count += sparse_stats.rebalance_count; + stats->record_count += sparse_stats.record_count; + stats->collision_count += sparse_stats.collision_count; + stats->discard_count += sparse_stats.discard_count; + stats->overflow_count += sparse_stats.overflow_count; + stats->delta_lists += sparse_stats.delta_lists; + stats->early_flushes += sparse_stats.early_flushes; +} + +static int initialize_volume_sub_index(const struct uds_configuration *config, + u64 volume_nonce, u8 tag, + struct volume_sub_index *sub_index) +{ + struct sub_index_parameters params = { .address_bits = 0 }; + unsigned int zone_count = config->zone_count; + u64 available_bytes = 0; + unsigned int z; + int result; + + result = compute_volume_sub_index_parameters(config, ¶ms); + if (result != UDS_SUCCESS) + return result; + + sub_index->address_bits = params.address_bits; + sub_index->address_mask = (1u << params.address_bits) - 1; + sub_index->chapter_bits = params.chapter_bits; + sub_index->chapter_mask = (1u << params.chapter_bits) - 1; + sub_index->chapter_count = params.chapter_count; + sub_index->list_count = params.list_count; + sub_index->zone_count = zone_count; + sub_index->chapter_zone_bits = params.chapter_size_in_bits / zone_count; + sub_index->volume_nonce = volume_nonce; + + result = uds_initialize_delta_index(&sub_index->delta_index, zone_count, + params.list_count, params.mean_delta, + params.chapter_bits, params.memory_size, + tag); + if (result != UDS_SUCCESS) + return result; + + for (z = 0; z < sub_index->delta_index.zone_count; z++) + available_bytes += sub_index->delta_index.delta_zones[z].size; + available_bytes -= params.target_free_bytes; + sub_index->max_zone_bits = (available_bytes * BITS_PER_BYTE) / zone_count; + sub_index->memory_size = (sub_index->delta_index.memory_size + + sizeof(struct volume_sub_index) + + (params.list_count * sizeof(u64)) + + (zone_count * sizeof(struct volume_sub_index_zone))); + + /* The following arrays are initialized to all zeros. */ + result = vdo_allocate(params.list_count, u64, "first chapter to flush", + &sub_index->flush_chapters); + if (result != VDO_SUCCESS) + return result; + + return vdo_allocate(zone_count, struct volume_sub_index_zone, + "volume index zones", &sub_index->zones); +} + +int uds_make_volume_index(const struct uds_configuration *config, u64 volume_nonce, + struct volume_index **volume_index_ptr) +{ + struct split_config split; + unsigned int zone; + struct volume_index *volume_index; + int result; + + result = vdo_allocate(1, struct volume_index, "volume index", &volume_index); + if (result != VDO_SUCCESS) + return result; + + volume_index->zone_count = config->zone_count; + + if (!uds_is_sparse_index_geometry(config->geometry)) { + result = initialize_volume_sub_index(config, volume_nonce, 'm', + &volume_index->vi_non_hook); + if (result != UDS_SUCCESS) { + uds_free_volume_index(volume_index); + return result; + } + + volume_index->memory_size = volume_index->vi_non_hook.memory_size; + *volume_index_ptr = volume_index; + return UDS_SUCCESS; + } + + volume_index->sparse_sample_rate = config->sparse_sample_rate; + + result = vdo_allocate(config->zone_count, struct volume_index_zone, + "volume index zones", &volume_index->zones); + if (result != VDO_SUCCESS) { + uds_free_volume_index(volume_index); + return result; + } + + for (zone = 0; zone < config->zone_count; zone++) + mutex_init(&volume_index->zones[zone].hook_mutex); + + split_configuration(config, &split); + result = initialize_volume_sub_index(&split.non_hook_config, volume_nonce, 'd', + &volume_index->vi_non_hook); + if (result != UDS_SUCCESS) { + uds_free_volume_index(volume_index); + return vdo_log_error_strerror(result, + "Error creating non hook volume index"); + } + + result = initialize_volume_sub_index(&split.hook_config, volume_nonce, 's', + &volume_index->vi_hook); + if (result != UDS_SUCCESS) { + uds_free_volume_index(volume_index); + return vdo_log_error_strerror(result, + "Error creating hook volume index"); + } + + volume_index->memory_size = + volume_index->vi_non_hook.memory_size + volume_index->vi_hook.memory_size; + *volume_index_ptr = volume_index; + return UDS_SUCCESS; +} diff --git a/drivers/md/dm-vdo/indexer/volume-index.h b/drivers/md/dm-vdo/indexer/volume-index.h new file mode 100644 index 0000000000..583998c547 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/volume-index.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_VOLUME_INDEX_H +#define UDS_VOLUME_INDEX_H + +#include + +#include "thread-utils.h" + +#include "config.h" +#include "delta-index.h" +#include "indexer.h" + +/* + * The volume index is the primary top-level index for UDS. It contains records which map a record + * name to the chapter where a record with that name is stored. This mapping can definitively say + * when no record exists. However, because we only use a subset of the name for this index, it + * cannot definitively say that a record for the entry does exist. It can only say that if a record + * exists, it will be in a particular chapter. The request can then be dispatched to that chapter + * for further processing. + * + * If the volume_index_record does not actually match the record name, the index can store a more + * specific collision record to disambiguate the new entry from the existing one. Index entries are + * managed with volume_index_record structures. + */ + +#define NO_CHAPTER U64_MAX + +struct volume_index_stats { + /* Nanoseconds spent rebalancing */ + ktime_t rebalance_time; + /* Number of memory rebalances */ + u32 rebalance_count; + /* The number of records in the index */ + u64 record_count; + /* The number of collision records */ + u64 collision_count; + /* The number of records removed */ + u64 discard_count; + /* The number of UDS_OVERFLOWs detected */ + u64 overflow_count; + /* The number of delta lists */ + u32 delta_lists; + /* Number of early flushes */ + u64 early_flushes; +}; + +struct volume_sub_index_zone { + u64 virtual_chapter_low; + u64 virtual_chapter_high; + u64 early_flushes; +} __aligned(L1_CACHE_BYTES); + +struct volume_sub_index { + /* The delta index */ + struct delta_index delta_index; + /* The first chapter to be flushed in each zone */ + u64 *flush_chapters; + /* The zones */ + struct volume_sub_index_zone *zones; + /* The volume nonce */ + u64 volume_nonce; + /* Expected size of a chapter (per zone) */ + u64 chapter_zone_bits; + /* Maximum size of the index (per zone) */ + u64 max_zone_bits; + /* The number of bits in address mask */ + u8 address_bits; + /* Mask to get address within delta list */ + u32 address_mask; + /* The number of bits in chapter number */ + u8 chapter_bits; + /* The largest storable chapter number */ + u32 chapter_mask; + /* The number of chapters used */ + u32 chapter_count; + /* The number of delta lists */ + u32 list_count; + /* The number of zones */ + unsigned int zone_count; + /* The amount of memory allocated */ + u64 memory_size; +}; + +struct volume_index_zone { + /* Protects the sampled index in this zone */ + struct mutex hook_mutex; +} __aligned(L1_CACHE_BYTES); + +struct volume_index { + u32 sparse_sample_rate; + unsigned int zone_count; + u64 memory_size; + struct volume_sub_index vi_non_hook; + struct volume_sub_index vi_hook; + struct volume_index_zone *zones; +}; + +/* + * The volume_index_record structure is used to facilitate processing of a record name. A client + * first calls uds_get_volume_index_record() to find the volume index record for a record name. The + * fields of the record can then be examined to determine the state of the record. + * + * If is_found is false, then the index did not find an entry for the record name. Calling + * uds_put_volume_index_record() will insert a new entry for that name at the proper place. + * + * If is_found is true, then we did find an entry for the record name, and the virtual_chapter and + * is_collision fields reflect the entry found. Subsequently, a call to + * uds_remove_volume_index_record() will remove the entry, a call to + * uds_set_volume_index_record_chapter() will update the existing entry, and a call to + * uds_put_volume_index_record() will insert a new collision record after the existing entry. + */ +struct volume_index_record { + /* Public fields */ + + /* Chapter where the record info is found */ + u64 virtual_chapter; + /* This record is a collision */ + bool is_collision; + /* This record is the requested record */ + bool is_found; + + /* Private fields */ + + /* Zone that contains this name */ + unsigned int zone_number; + /* The volume index */ + struct volume_sub_index *sub_index; + /* Mutex for accessing this delta index entry in the hook index */ + struct mutex *mutex; + /* The record name to which this record refers */ + const struct uds_record_name *name; + /* The delta index entry for this record */ + struct delta_index_entry delta_entry; +}; + +int __must_check uds_make_volume_index(const struct uds_configuration *config, + u64 volume_nonce, + struct volume_index **volume_index); + +void uds_free_volume_index(struct volume_index *volume_index); + +int __must_check uds_compute_volume_index_save_blocks(const struct uds_configuration *config, + size_t block_size, + u64 *block_count); + +unsigned int __must_check uds_get_volume_index_zone(const struct volume_index *volume_index, + const struct uds_record_name *name); + +bool __must_check uds_is_volume_index_sample(const struct volume_index *volume_index, + const struct uds_record_name *name); + +/* + * This function is only used to manage sparse cache membership. Most requests should use + * uds_get_volume_index_record() to look up index records instead. + */ +u64 __must_check uds_lookup_volume_index_name(const struct volume_index *volume_index, + const struct uds_record_name *name); + +int __must_check uds_get_volume_index_record(struct volume_index *volume_index, + const struct uds_record_name *name, + struct volume_index_record *record); + +int __must_check uds_put_volume_index_record(struct volume_index_record *record, + u64 virtual_chapter); + +int __must_check uds_remove_volume_index_record(struct volume_index_record *record); + +int __must_check uds_set_volume_index_record_chapter(struct volume_index_record *record, + u64 virtual_chapter); + +void uds_set_volume_index_open_chapter(struct volume_index *volume_index, + u64 virtual_chapter); + +void uds_set_volume_index_zone_open_chapter(struct volume_index *volume_index, + unsigned int zone_number, + u64 virtual_chapter); + +int __must_check uds_load_volume_index(struct volume_index *volume_index, + struct buffered_reader **readers, + unsigned int reader_count); + +int __must_check uds_save_volume_index(struct volume_index *volume_index, + struct buffered_writer **writers, + unsigned int writer_count); + +void uds_get_volume_index_stats(const struct volume_index *volume_index, + struct volume_index_stats *stats); + +#endif /* UDS_VOLUME_INDEX_H */ diff --git a/drivers/md/dm-vdo/indexer/volume.c b/drivers/md/dm-vdo/indexer/volume.c new file mode 100644 index 0000000000..655453bb27 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/volume.c @@ -0,0 +1,1693 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + +#include "volume.h" + +#include +#include +#include + +#include "errors.h" +#include "logger.h" +#include "memory-alloc.h" +#include "permassert.h" +#include "string-utils.h" +#include "thread-utils.h" + +#include "chapter-index.h" +#include "config.h" +#include "geometry.h" +#include "hash-utils.h" +#include "index.h" +#include "sparse-cache.h" + +/* + * The first block of the volume layout is reserved for the volume header, which is no longer used. + * The remainder of the volume is divided into chapters consisting of several pages of records, and + * several pages of static index to use to find those records. The index pages are recorded first, + * followed by the record pages. The chapters are written in order as they are filled, so the + * volume storage acts as a circular log of the most recent chapters, with each new chapter + * overwriting the oldest saved one. + * + * When a new chapter is filled and closed, the records from that chapter are sorted and + * interleaved in approximate temporal order, and assigned to record pages. Then a static delta + * index is generated to store which record page contains each record. The in-memory index page map + * is also updated to indicate which delta lists fall on each chapter index page. This means that + * when a record is read, the volume only has to load a single index page and a single record page, + * rather than search the entire chapter. These index and record pages are written to storage, and + * the index pages are transferred to the page cache under the theory that the most recently + * written chapter is likely to be accessed again soon. + * + * When reading a record, the volume index will indicate which chapter should contain it. The + * volume uses the index page map to determine which chapter index page needs to be loaded, and + * then reads the relevant record page number from the chapter index. Both index and record pages + * are stored in a page cache when read for the common case that subsequent records need the same + * pages. The page cache evicts the least recently accessed entries when caching new pages. In + * addition, the volume uses dm-bufio to manage access to the storage, which may allow for + * additional caching depending on available system resources. + * + * Record requests are handled from cached pages when possible. If a page needs to be read, it is + * placed on a queue along with the request that wants to read it. Any requests for the same page + * that arrive while the read is pending are added to the queue entry. A separate reader thread + * handles the queued reads, adding the page to the cache and updating any requests queued with it + * so they can continue processing. This allows the index zone threads to continue processing new + * requests rather than wait for the storage reads. + * + * When an index rebuild is necessary, the volume reads each stored chapter to determine which + * range of chapters contain valid records, so that those records can be used to reconstruct the + * in-memory volume index. + */ + +/* The maximum allowable number of contiguous bad chapters */ +#define MAX_BAD_CHAPTERS 100 +#define VOLUME_CACHE_MAX_ENTRIES (U16_MAX >> 1) +#define VOLUME_CACHE_QUEUED_FLAG (1 << 15) +#define VOLUME_CACHE_MAX_QUEUED_READS 4096 + +static const u64 BAD_CHAPTER = U64_MAX; + +/* + * The invalidate counter is two 32 bits fields stored together atomically. The low order 32 bits + * are the physical page number of the cached page being read. The high order 32 bits are a + * sequence number. This value is written when the zone that owns it begins or completes a cache + * search. Any other thread will only read the counter in wait_for_pending_searches() while waiting + * to update the cache contents. + */ +union invalidate_counter { + u64 value; + struct { + u32 page; + u32 counter; + }; +}; + +static inline u32 map_to_page_number(struct index_geometry *geometry, u32 physical_page) +{ + return (physical_page - HEADER_PAGES_PER_VOLUME) % geometry->pages_per_chapter; +} + +static inline u32 map_to_chapter_number(struct index_geometry *geometry, u32 physical_page) +{ + return (physical_page - HEADER_PAGES_PER_VOLUME) / geometry->pages_per_chapter; +} + +static inline bool is_record_page(struct index_geometry *geometry, u32 physical_page) +{ + return map_to_page_number(geometry, physical_page) >= geometry->index_pages_per_chapter; +} + +static u32 map_to_physical_page(const struct index_geometry *geometry, u32 chapter, u32 page) +{ + /* Page zero is the header page, so the first chapter index page is page one. */ + return HEADER_PAGES_PER_VOLUME + (geometry->pages_per_chapter * chapter) + page; +} + +static inline union invalidate_counter get_invalidate_counter(struct page_cache *cache, + unsigned int zone_number) +{ + return (union invalidate_counter) { + .value = READ_ONCE(cache->search_pending_counters[zone_number].atomic_value), + }; +} + +static inline void set_invalidate_counter(struct page_cache *cache, + unsigned int zone_number, + union invalidate_counter invalidate_counter) +{ + WRITE_ONCE(cache->search_pending_counters[zone_number].atomic_value, + invalidate_counter.value); +} + +static inline bool search_pending(union invalidate_counter invalidate_counter) +{ + return (invalidate_counter.counter & 1) != 0; +} + +/* Lock the cache for a zone in order to search for a page. */ +static void begin_pending_search(struct page_cache *cache, u32 physical_page, + unsigned int zone_number) +{ + union invalidate_counter invalidate_counter = + get_invalidate_counter(cache, zone_number); + + invalidate_counter.page = physical_page; + invalidate_counter.counter++; + set_invalidate_counter(cache, zone_number, invalidate_counter); + VDO_ASSERT_LOG_ONLY(search_pending(invalidate_counter), + "Search is pending for zone %u", zone_number); + /* + * This memory barrier ensures that the write to the invalidate counter is seen by other + * threads before this thread accesses the cached page. The corresponding read memory + * barrier is in wait_for_pending_searches(). + */ + smp_mb(); +} + +/* Unlock the cache for a zone by clearing its invalidate counter. */ +static void end_pending_search(struct page_cache *cache, unsigned int zone_number) +{ + union invalidate_counter invalidate_counter; + + /* + * This memory barrier ensures that this thread completes reads of the + * cached page before other threads see the write to the invalidate + * counter. + */ + smp_mb(); + + invalidate_counter = get_invalidate_counter(cache, zone_number); + VDO_ASSERT_LOG_ONLY(search_pending(invalidate_counter), + "Search is pending for zone %u", zone_number); + invalidate_counter.counter++; + set_invalidate_counter(cache, zone_number, invalidate_counter); +} + +static void wait_for_pending_searches(struct page_cache *cache, u32 physical_page) +{ + union invalidate_counter initial_counters[MAX_ZONES]; + unsigned int i; + + /* + * We hold the read_threads_mutex. We are waiting for threads that do not hold the + * read_threads_mutex. Those threads have "locked" their targeted page by setting the + * search_pending_counter. The corresponding write memory barrier is in + * begin_pending_search(). + */ + smp_mb(); + + for (i = 0; i < cache->zone_count; i++) + initial_counters[i] = get_invalidate_counter(cache, i); + for (i = 0; i < cache->zone_count; i++) { + if (search_pending(initial_counters[i]) && + (initial_counters[i].page == physical_page)) { + /* + * There is an active search using the physical page. We need to wait for + * the search to finish. + * + * FIXME: Investigate using wait_event() to wait for the search to finish. + */ + while (initial_counters[i].value == + get_invalidate_counter(cache, i).value) + cond_resched(); + } + } +} + +static void release_page_buffer(struct cached_page *page) +{ + if (page->buffer != NULL) + dm_bufio_release(vdo_forget(page->buffer)); +} + +static void clear_cache_page(struct page_cache *cache, struct cached_page *page) +{ + /* Do not clear read_pending because the read queue relies on it. */ + release_page_buffer(page); + page->physical_page = cache->indexable_pages; + WRITE_ONCE(page->last_used, 0); +} + +static void make_page_most_recent(struct page_cache *cache, struct cached_page *page) +{ + /* + * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any + * thread holding the read_threads_mutex. + */ + if (atomic64_read(&cache->clock) != READ_ONCE(page->last_used)) + WRITE_ONCE(page->last_used, atomic64_inc_return(&cache->clock)); +} + +/* Select a page to remove from the cache to make space for a new entry. */ +static struct cached_page *select_victim_in_cache(struct page_cache *cache) +{ + struct cached_page *page; + int oldest_index = 0; + s64 oldest_time = S64_MAX; + s64 last_used; + u16 i; + + /* Find the oldest unclaimed page. We hold the read_threads_mutex. */ + for (i = 0; i < cache->cache_slots; i++) { + /* A page with a pending read must not be replaced. */ + if (cache->cache[i].read_pending) + continue; + + last_used = READ_ONCE(cache->cache[i].last_used); + if (last_used <= oldest_time) { + oldest_time = last_used; + oldest_index = i; + } + } + + page = &cache->cache[oldest_index]; + if (page->physical_page != cache->indexable_pages) { + WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots); + wait_for_pending_searches(cache, page->physical_page); + } + + page->read_pending = true; + clear_cache_page(cache, page); + return page; +} + +/* Make a newly filled cache entry available to other threads. */ +static int put_page_in_cache(struct page_cache *cache, u32 physical_page, + struct cached_page *page) +{ + int result; + + /* We hold the read_threads_mutex. */ + result = VDO_ASSERT((page->read_pending), "page to install has a pending read"); + if (result != VDO_SUCCESS) + return result; + + page->physical_page = physical_page; + make_page_most_recent(cache, page); + page->read_pending = false; + + /* + * We hold the read_threads_mutex, but we must have a write memory barrier before making + * the cached_page available to the readers that do not hold the mutex. The corresponding + * read memory barrier is in get_page_and_index(). + */ + smp_wmb(); + + /* This assignment also clears the queued flag. */ + WRITE_ONCE(cache->index[physical_page], page - cache->cache); + return UDS_SUCCESS; +} + +static void cancel_page_in_cache(struct page_cache *cache, u32 physical_page, + struct cached_page *page) +{ + int result; + + /* We hold the read_threads_mutex. */ + result = VDO_ASSERT((page->read_pending), "page to install has a pending read"); + if (result != VDO_SUCCESS) + return; + + clear_cache_page(cache, page); + page->read_pending = false; + + /* Clear the mapping and the queued flag for the new page. */ + WRITE_ONCE(cache->index[physical_page], cache->cache_slots); +} + +static inline u16 next_queue_position(u16 position) +{ + return (position + 1) % VOLUME_CACHE_MAX_QUEUED_READS; +} + +static inline void advance_queue_position(u16 *position) +{ + *position = next_queue_position(*position); +} + +static inline bool read_queue_is_full(struct page_cache *cache) +{ + return cache->read_queue_first == next_queue_position(cache->read_queue_last); +} + +static bool enqueue_read(struct page_cache *cache, struct uds_request *request, + u32 physical_page) +{ + struct queued_read *queue_entry; + u16 last = cache->read_queue_last; + u16 read_queue_index; + + /* We hold the read_threads_mutex. */ + if ((cache->index[physical_page] & VOLUME_CACHE_QUEUED_FLAG) == 0) { + /* This page has no existing entry in the queue. */ + if (read_queue_is_full(cache)) + return false; + + /* Fill in the read queue entry. */ + cache->read_queue[last].physical_page = physical_page; + cache->read_queue[last].invalid = false; + cache->read_queue[last].first_request = NULL; + cache->read_queue[last].last_request = NULL; + + /* Point the cache index to the read queue entry. */ + read_queue_index = last; + WRITE_ONCE(cache->index[physical_page], + read_queue_index | VOLUME_CACHE_QUEUED_FLAG); + + advance_queue_position(&cache->read_queue_last); + } else { + /* It's already queued, so add this request to the existing entry. */ + read_queue_index = cache->index[physical_page] & ~VOLUME_CACHE_QUEUED_FLAG; + } + + request->next_request = NULL; + queue_entry = &cache->read_queue[read_queue_index]; + if (queue_entry->first_request == NULL) + queue_entry->first_request = request; + else + queue_entry->last_request->next_request = request; + queue_entry->last_request = request; + + return true; +} + +static void enqueue_page_read(struct volume *volume, struct uds_request *request, + u32 physical_page) +{ + /* Mark the page as queued, so that chapter invalidation knows to cancel a read. */ + while (!enqueue_read(&volume->page_cache, request, physical_page)) { + vdo_log_debug("Read queue full, waiting for reads to finish"); + uds_wait_cond(&volume->read_threads_read_done_cond, + &volume->read_threads_mutex); + } + + uds_signal_cond(&volume->read_threads_cond); +} + +/* + * Reserve the next read queue entry for processing, but do not actually remove it from the queue. + * Must be followed by release_queued_requests(). + */ +static struct queued_read *reserve_read_queue_entry(struct page_cache *cache) +{ + /* We hold the read_threads_mutex. */ + struct queued_read *entry; + u16 index_value; + bool queued; + + /* No items to dequeue */ + if (cache->read_queue_next_read == cache->read_queue_last) + return NULL; + + entry = &cache->read_queue[cache->read_queue_next_read]; + index_value = cache->index[entry->physical_page]; + queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0; + /* Check to see if it's still queued before resetting. */ + if (entry->invalid && queued) + WRITE_ONCE(cache->index[entry->physical_page], cache->cache_slots); + + /* + * If a synchronous read has taken this page, set invalid to true so it doesn't get + * overwritten. Requests will just be requeued. + */ + if (!queued) + entry->invalid = true; + + entry->reserved = true; + advance_queue_position(&cache->read_queue_next_read); + return entry; +} + +static inline struct queued_read *wait_to_reserve_read_queue_entry(struct volume *volume) +{ + struct queued_read *queue_entry = NULL; + + while (!volume->read_threads_exiting) { + queue_entry = reserve_read_queue_entry(&volume->page_cache); + if (queue_entry != NULL) + break; + + uds_wait_cond(&volume->read_threads_cond, &volume->read_threads_mutex); + } + + return queue_entry; +} + +static int init_chapter_index_page(const struct volume *volume, u8 *index_page, + u32 chapter, u32 index_page_number, + struct delta_index_page *chapter_index_page) +{ + u64 ci_virtual; + u32 ci_chapter; + u32 lowest_list; + u32 highest_list; + struct index_geometry *geometry = volume->geometry; + int result; + + result = uds_initialize_chapter_index_page(chapter_index_page, geometry, + index_page, volume->nonce); + if (volume->lookup_mode == LOOKUP_FOR_REBUILD) + return result; + + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, + "Reading chapter index page for chapter %u page %u", + chapter, index_page_number); + } + + uds_get_list_number_bounds(volume->index_page_map, chapter, index_page_number, + &lowest_list, &highest_list); + ci_virtual = chapter_index_page->virtual_chapter_number; + ci_chapter = uds_map_to_physical_chapter(geometry, ci_virtual); + if ((chapter == ci_chapter) && + (lowest_list == chapter_index_page->lowest_list_number) && + (highest_list == chapter_index_page->highest_list_number)) + return UDS_SUCCESS; + + vdo_log_warning("Index page map updated to %llu", + (unsigned long long) volume->index_page_map->last_update); + vdo_log_warning("Page map expects that chapter %u page %u has range %u to %u, but chapter index page has chapter %llu with range %u to %u", + chapter, index_page_number, lowest_list, highest_list, + (unsigned long long) ci_virtual, + chapter_index_page->lowest_list_number, + chapter_index_page->highest_list_number); + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "index page map mismatch with chapter index"); +} + +static int initialize_index_page(const struct volume *volume, u32 physical_page, + struct cached_page *page) +{ + u32 chapter = map_to_chapter_number(volume->geometry, physical_page); + u32 index_page_number = map_to_page_number(volume->geometry, physical_page); + + return init_chapter_index_page(volume, dm_bufio_get_block_data(page->buffer), + chapter, index_page_number, &page->index_page); +} + +static bool search_record_page(const u8 record_page[], + const struct uds_record_name *name, + const struct index_geometry *geometry, + struct uds_record_data *metadata) +{ + /* + * The array of records is sorted by name and stored as a binary tree in heap order, so the + * root of the tree is the first array element. + */ + u32 node = 0; + const struct uds_volume_record *records = (const struct uds_volume_record *) record_page; + + while (node < geometry->records_per_page) { + int result; + const struct uds_volume_record *record = &records[node]; + + result = memcmp(name, &record->name, UDS_RECORD_NAME_SIZE); + if (result == 0) { + if (metadata != NULL) + *metadata = record->data; + return true; + } + + /* The children of node N are at indexes 2N+1 and 2N+2. */ + node = ((2 * node) + ((result < 0) ? 1 : 2)); + } + + return false; +} + +/* + * If we've read in a record page, we're going to do an immediate search, to speed up processing by + * avoiding get_record_from_zone(), and to ensure that requests make progress even when queued. If + * we've read in an index page, we save the record page number so we don't have to resolve the + * index page again. We use the location, virtual_chapter, and old_metadata fields in the request + * to allow the index code to know where to begin processing the request again. + */ +static int search_page(struct cached_page *page, const struct volume *volume, + struct uds_request *request, u32 physical_page) +{ + int result; + enum uds_index_region location; + u16 record_page_number; + + if (is_record_page(volume->geometry, physical_page)) { + if (search_record_page(dm_bufio_get_block_data(page->buffer), + &request->record_name, volume->geometry, + &request->old_metadata)) + location = UDS_LOCATION_RECORD_PAGE_LOOKUP; + else + location = UDS_LOCATION_UNAVAILABLE; + } else { + result = uds_search_chapter_index_page(&page->index_page, + volume->geometry, + &request->record_name, + &record_page_number); + if (result != UDS_SUCCESS) + return result; + + if (record_page_number == NO_CHAPTER_INDEX_ENTRY) { + location = UDS_LOCATION_UNAVAILABLE; + } else { + location = UDS_LOCATION_INDEX_PAGE_LOOKUP; + *((u16 *) &request->old_metadata) = record_page_number; + } + } + + request->location = location; + request->found = false; + return UDS_SUCCESS; +} + +static int process_entry(struct volume *volume, struct queued_read *entry) +{ + u32 page_number = entry->physical_page; + struct uds_request *request; + struct cached_page *page = NULL; + u8 *page_data; + int result; + + if (entry->invalid) { + vdo_log_debug("Requeuing requests for invalid page"); + return UDS_SUCCESS; + } + + page = select_victim_in_cache(&volume->page_cache); + + mutex_unlock(&volume->read_threads_mutex); + page_data = dm_bufio_read(volume->client, page_number, &page->buffer); + mutex_lock(&volume->read_threads_mutex); + if (IS_ERR(page_data)) { + result = -PTR_ERR(page_data); + vdo_log_warning_strerror(result, + "error reading physical page %u from volume", + page_number); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return result; + } + + if (entry->invalid) { + vdo_log_warning("Page %u invalidated after read", page_number); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return UDS_SUCCESS; + } + + if (!is_record_page(volume->geometry, page_number)) { + result = initialize_index_page(volume, page_number, page); + if (result != UDS_SUCCESS) { + vdo_log_warning("Error initializing chapter index page"); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return result; + } + } + + result = put_page_in_cache(&volume->page_cache, page_number, page); + if (result != UDS_SUCCESS) { + vdo_log_warning("Error putting page %u in cache", page_number); + cancel_page_in_cache(&volume->page_cache, page_number, page); + return result; + } + + request = entry->first_request; + while ((request != NULL) && (result == UDS_SUCCESS)) { + result = search_page(page, volume, request, page_number); + request = request->next_request; + } + + return result; +} + +static void release_queued_requests(struct volume *volume, struct queued_read *entry, + int result) +{ + struct page_cache *cache = &volume->page_cache; + u16 next_read = cache->read_queue_next_read; + struct uds_request *request; + struct uds_request *next; + + for (request = entry->first_request; request != NULL; request = next) { + next = request->next_request; + request->status = result; + request->requeued = true; + uds_enqueue_request(request, STAGE_INDEX); + } + + entry->reserved = false; + + /* Move the read_queue_first pointer as far as we can. */ + while ((cache->read_queue_first != next_read) && + (!cache->read_queue[cache->read_queue_first].reserved)) + advance_queue_position(&cache->read_queue_first); + uds_broadcast_cond(&volume->read_threads_read_done_cond); +} + +static void read_thread_function(void *arg) +{ + struct volume *volume = arg; + + vdo_log_debug("reader starting"); + mutex_lock(&volume->read_threads_mutex); + while (true) { + struct queued_read *queue_entry; + int result; + + queue_entry = wait_to_reserve_read_queue_entry(volume); + if (volume->read_threads_exiting) + break; + + result = process_entry(volume, queue_entry); + release_queued_requests(volume, queue_entry, result); + } + mutex_unlock(&volume->read_threads_mutex); + vdo_log_debug("reader done"); +} + +static void get_page_and_index(struct page_cache *cache, u32 physical_page, + int *queue_index, struct cached_page **page_ptr) +{ + u16 index_value; + u16 index; + bool queued; + + /* + * ASSERTION: We are either a zone thread holding a search_pending_counter, or we are any + * thread holding the read_threads_mutex. + * + * Holding only a search_pending_counter is the most frequent case. + */ + /* + * It would be unlikely for the compiler to turn the usage of index_value into two reads of + * cache->index, but it would be possible and very bad if those reads did not return the + * same bits. + */ + index_value = READ_ONCE(cache->index[physical_page]); + queued = (index_value & VOLUME_CACHE_QUEUED_FLAG) != 0; + index = index_value & ~VOLUME_CACHE_QUEUED_FLAG; + + if (!queued && (index < cache->cache_slots)) { + *page_ptr = &cache->cache[index]; + /* + * We have acquired access to the cached page, but unless we hold the + * read_threads_mutex, we need a read memory barrier now. The corresponding write + * memory barrier is in put_page_in_cache(). + */ + smp_rmb(); + } else { + *page_ptr = NULL; + } + + *queue_index = queued ? index : -1; +} + +static void get_page_from_cache(struct page_cache *cache, u32 physical_page, + struct cached_page **page) +{ + /* + * ASSERTION: We are in a zone thread. + * ASSERTION: We holding a search_pending_counter or the read_threads_mutex. + */ + int queue_index = -1; + + get_page_and_index(cache, physical_page, &queue_index, page); +} + +static int read_page_locked(struct volume *volume, u32 physical_page, + struct cached_page **page_ptr) +{ + int result = UDS_SUCCESS; + struct cached_page *page = NULL; + u8 *page_data; + + page = select_victim_in_cache(&volume->page_cache); + page_data = dm_bufio_read(volume->client, physical_page, &page->buffer); + if (IS_ERR(page_data)) { + result = -PTR_ERR(page_data); + vdo_log_warning_strerror(result, + "error reading physical page %u from volume", + physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + if (!is_record_page(volume->geometry, physical_page)) { + result = initialize_index_page(volume, physical_page, page); + if (result != UDS_SUCCESS) { + if (volume->lookup_mode != LOOKUP_FOR_REBUILD) + vdo_log_warning("Corrupt index page %u", physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + } + + result = put_page_in_cache(&volume->page_cache, physical_page, page); + if (result != UDS_SUCCESS) { + vdo_log_warning("Error putting page %u in cache", physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + *page_ptr = page; + return UDS_SUCCESS; +} + +/* Retrieve a page from the cache while holding the read threads mutex. */ +static int get_volume_page_locked(struct volume *volume, u32 physical_page, + struct cached_page **page_ptr) +{ + int result; + struct cached_page *page = NULL; + + get_page_from_cache(&volume->page_cache, physical_page, &page); + if (page == NULL) { + result = read_page_locked(volume, physical_page, &page); + if (result != UDS_SUCCESS) + return result; + } else { + make_page_most_recent(&volume->page_cache, page); + } + + *page_ptr = page; + return UDS_SUCCESS; +} + +/* Retrieve a page from the cache while holding a search_pending lock. */ +static int get_volume_page_protected(struct volume *volume, struct uds_request *request, + u32 physical_page, struct cached_page **page_ptr) +{ + struct cached_page *page; + + get_page_from_cache(&volume->page_cache, physical_page, &page); + if (page != NULL) { + if (request->zone_number == 0) { + /* Only one zone is allowed to update the LRU. */ + make_page_most_recent(&volume->page_cache, page); + } + + *page_ptr = page; + return UDS_SUCCESS; + } + + /* Prepare to enqueue a read for the page. */ + end_pending_search(&volume->page_cache, request->zone_number); + mutex_lock(&volume->read_threads_mutex); + + /* + * Do the lookup again while holding the read mutex (no longer the fast case so this should + * be fine to repeat). We need to do this because a page may have been added to the cache + * by a reader thread between the time we searched above and the time we went to actually + * try to enqueue it below. This could result in us enqueuing another read for a page which + * is already in the cache, which would mean we end up with two entries in the cache for + * the same page. + */ + get_page_from_cache(&volume->page_cache, physical_page, &page); + if (page == NULL) { + enqueue_page_read(volume, request, physical_page); + /* + * The performance gain from unlocking first, while "search pending" mode is off, + * turns out to be significant in some cases. The page is not available yet so + * the order does not matter for correctness as it does below. + */ + mutex_unlock(&volume->read_threads_mutex); + begin_pending_search(&volume->page_cache, physical_page, + request->zone_number); + return UDS_QUEUED; + } + + /* + * Now that the page is loaded, the volume needs to switch to "reader thread unlocked" and + * "search pending" state in careful order so no other thread can mess with the data before + * the caller gets to look at it. + */ + begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + mutex_unlock(&volume->read_threads_mutex); + *page_ptr = page; + return UDS_SUCCESS; +} + +static int get_volume_page(struct volume *volume, u32 chapter, u32 page_number, + struct cached_page **page_ptr) +{ + int result; + u32 physical_page = map_to_physical_page(volume->geometry, chapter, page_number); + + mutex_lock(&volume->read_threads_mutex); + result = get_volume_page_locked(volume, physical_page, page_ptr); + mutex_unlock(&volume->read_threads_mutex); + return result; +} + +int uds_get_volume_record_page(struct volume *volume, u32 chapter, u32 page_number, + u8 **data_ptr) +{ + int result; + struct cached_page *page = NULL; + + result = get_volume_page(volume, chapter, page_number, &page); + if (result == UDS_SUCCESS) + *data_ptr = dm_bufio_get_block_data(page->buffer); + return result; +} + +int uds_get_volume_index_page(struct volume *volume, u32 chapter, u32 page_number, + struct delta_index_page **index_page_ptr) +{ + int result; + struct cached_page *page = NULL; + + result = get_volume_page(volume, chapter, page_number, &page); + if (result == UDS_SUCCESS) + *index_page_ptr = &page->index_page; + return result; +} + +/* + * Find the record page associated with a name in a given index page. This will return UDS_QUEUED + * if the page in question must be read from storage. + */ +static int search_cached_index_page(struct volume *volume, struct uds_request *request, + u32 chapter, u32 index_page_number, + u16 *record_page_number) +{ + int result; + struct cached_page *page = NULL; + u32 physical_page = map_to_physical_page(volume->geometry, chapter, + index_page_number); + + /* + * Make sure the invalidate counter is updated before we try and read the mapping. This + * prevents this thread from reading a page in the cache which has already been marked for + * invalidation by the reader thread, before the reader thread has noticed that the + * invalidate_counter has been incremented. + */ + begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + + result = get_volume_page_protected(volume, request, physical_page, &page); + if (result != UDS_SUCCESS) { + end_pending_search(&volume->page_cache, request->zone_number); + return result; + } + + result = uds_search_chapter_index_page(&page->index_page, volume->geometry, + &request->record_name, + record_page_number); + end_pending_search(&volume->page_cache, request->zone_number); + return result; +} + +/* + * Find the metadata associated with a name in a given record page. This will return UDS_QUEUED if + * the page in question must be read from storage. + */ +int uds_search_cached_record_page(struct volume *volume, struct uds_request *request, + u32 chapter, u16 record_page_number, bool *found) +{ + struct cached_page *record_page; + struct index_geometry *geometry = volume->geometry; + int result; + u32 physical_page, page_number; + + *found = false; + if (record_page_number == NO_CHAPTER_INDEX_ENTRY) + return UDS_SUCCESS; + + result = VDO_ASSERT(record_page_number < geometry->record_pages_per_chapter, + "0 <= %d < %u", record_page_number, + geometry->record_pages_per_chapter); + if (result != VDO_SUCCESS) + return result; + + page_number = geometry->index_pages_per_chapter + record_page_number; + + physical_page = map_to_physical_page(volume->geometry, chapter, page_number); + + /* + * Make sure the invalidate counter is updated before we try and read the mapping. This + * prevents this thread from reading a page in the cache which has already been marked for + * invalidation by the reader thread, before the reader thread has noticed that the + * invalidate_counter has been incremented. + */ + begin_pending_search(&volume->page_cache, physical_page, request->zone_number); + + result = get_volume_page_protected(volume, request, physical_page, &record_page); + if (result != UDS_SUCCESS) { + end_pending_search(&volume->page_cache, request->zone_number); + return result; + } + + if (search_record_page(dm_bufio_get_block_data(record_page->buffer), + &request->record_name, geometry, &request->old_metadata)) + *found = true; + + end_pending_search(&volume->page_cache, request->zone_number); + return UDS_SUCCESS; +} + +void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter) +{ + const struct index_geometry *geometry = volume->geometry; + u32 physical_page = map_to_physical_page(geometry, chapter, 0); + + dm_bufio_prefetch(volume->client, physical_page, geometry->pages_per_chapter); +} + +int uds_read_chapter_index_from_volume(const struct volume *volume, u64 virtual_chapter, + struct dm_buffer *volume_buffers[], + struct delta_index_page index_pages[]) +{ + int result; + u32 i; + const struct index_geometry *geometry = volume->geometry; + u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter); + u32 physical_page = map_to_physical_page(geometry, physical_chapter, 0); + + dm_bufio_prefetch(volume->client, physical_page, geometry->index_pages_per_chapter); + for (i = 0; i < geometry->index_pages_per_chapter; i++) { + u8 *index_page; + + index_page = dm_bufio_read(volume->client, physical_page + i, + &volume_buffers[i]); + if (IS_ERR(index_page)) { + result = -PTR_ERR(index_page); + vdo_log_warning_strerror(result, + "error reading physical page %u", + physical_page); + return result; + } + + result = init_chapter_index_page(volume, index_page, physical_chapter, i, + &index_pages[i]); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +int uds_search_volume_page_cache(struct volume *volume, struct uds_request *request, + bool *found) +{ + int result; + u32 physical_chapter = + uds_map_to_physical_chapter(volume->geometry, request->virtual_chapter); + u32 index_page_number; + u16 record_page_number; + + index_page_number = uds_find_index_page_number(volume->index_page_map, + &request->record_name, + physical_chapter); + + if (request->location == UDS_LOCATION_INDEX_PAGE_LOOKUP) { + record_page_number = *((u16 *) &request->old_metadata); + } else { + result = search_cached_index_page(volume, request, physical_chapter, + index_page_number, + &record_page_number); + if (result != UDS_SUCCESS) + return result; + } + + return uds_search_cached_record_page(volume, request, physical_chapter, + record_page_number, found); +} + +int uds_search_volume_page_cache_for_rebuild(struct volume *volume, + const struct uds_record_name *name, + u64 virtual_chapter, bool *found) +{ + int result; + struct index_geometry *geometry = volume->geometry; + struct cached_page *page; + u32 physical_chapter = uds_map_to_physical_chapter(geometry, virtual_chapter); + u32 index_page_number; + u16 record_page_number; + u32 page_number; + + *found = false; + index_page_number = + uds_find_index_page_number(volume->index_page_map, name, + physical_chapter); + result = get_volume_page(volume, physical_chapter, index_page_number, &page); + if (result != UDS_SUCCESS) + return result; + + result = uds_search_chapter_index_page(&page->index_page, geometry, name, + &record_page_number); + if (result != UDS_SUCCESS) + return result; + + if (record_page_number == NO_CHAPTER_INDEX_ENTRY) + return UDS_SUCCESS; + + page_number = geometry->index_pages_per_chapter + record_page_number; + result = get_volume_page(volume, physical_chapter, page_number, &page); + if (result != UDS_SUCCESS) + return result; + + *found = search_record_page(dm_bufio_get_block_data(page->buffer), name, + geometry, NULL); + return UDS_SUCCESS; +} + +static void invalidate_page(struct page_cache *cache, u32 physical_page) +{ + struct cached_page *page; + int queue_index = -1; + + /* We hold the read_threads_mutex. */ + get_page_and_index(cache, physical_page, &queue_index, &page); + if (page != NULL) { + WRITE_ONCE(cache->index[page->physical_page], cache->cache_slots); + wait_for_pending_searches(cache, page->physical_page); + clear_cache_page(cache, page); + } else if (queue_index > -1) { + vdo_log_debug("setting pending read to invalid"); + cache->read_queue[queue_index].invalid = true; + } +} + +void uds_forget_chapter(struct volume *volume, u64 virtual_chapter) +{ + u32 physical_chapter = + uds_map_to_physical_chapter(volume->geometry, virtual_chapter); + u32 first_page = map_to_physical_page(volume->geometry, physical_chapter, 0); + u32 i; + + vdo_log_debug("forgetting chapter %llu", (unsigned long long) virtual_chapter); + mutex_lock(&volume->read_threads_mutex); + for (i = 0; i < volume->geometry->pages_per_chapter; i++) + invalidate_page(&volume->page_cache, first_page + i); + mutex_unlock(&volume->read_threads_mutex); +} + +/* + * Donate an index pages from a newly written chapter to the page cache since it is likely to be + * used again soon. The caller must already hold the reader thread mutex. + */ +static int donate_index_page_locked(struct volume *volume, u32 physical_chapter, + u32 index_page_number, struct dm_buffer *page_buffer) +{ + int result; + struct cached_page *page = NULL; + u32 physical_page = + map_to_physical_page(volume->geometry, physical_chapter, + index_page_number); + + page = select_victim_in_cache(&volume->page_cache); + page->buffer = page_buffer; + result = init_chapter_index_page(volume, dm_bufio_get_block_data(page_buffer), + physical_chapter, index_page_number, + &page->index_page); + if (result != UDS_SUCCESS) { + vdo_log_warning("Error initialize chapter index page"); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + result = put_page_in_cache(&volume->page_cache, physical_page, page); + if (result != UDS_SUCCESS) { + vdo_log_warning("Error putting page %u in cache", physical_page); + cancel_page_in_cache(&volume->page_cache, physical_page, page); + return result; + } + + return UDS_SUCCESS; +} + +static int write_index_pages(struct volume *volume, u32 physical_chapter_number, + struct open_chapter_index *chapter_index) +{ + struct index_geometry *geometry = volume->geometry; + struct dm_buffer *page_buffer; + u32 first_index_page = map_to_physical_page(geometry, physical_chapter_number, 0); + u32 delta_list_number = 0; + u32 index_page_number; + + for (index_page_number = 0; + index_page_number < geometry->index_pages_per_chapter; + index_page_number++) { + u8 *page_data; + u32 physical_page = first_index_page + index_page_number; + u32 lists_packed; + bool last_page; + int result; + + page_data = dm_bufio_new(volume->client, physical_page, &page_buffer); + if (IS_ERR(page_data)) { + return vdo_log_warning_strerror(-PTR_ERR(page_data), + "failed to prepare index page"); + } + + last_page = ((index_page_number + 1) == geometry->index_pages_per_chapter); + result = uds_pack_open_chapter_index_page(chapter_index, page_data, + delta_list_number, last_page, + &lists_packed); + if (result != UDS_SUCCESS) { + dm_bufio_release(page_buffer); + return vdo_log_warning_strerror(result, + "failed to pack index page"); + } + + dm_bufio_mark_buffer_dirty(page_buffer); + + if (lists_packed == 0) { + vdo_log_debug("no delta lists packed on chapter %u page %u", + physical_chapter_number, index_page_number); + } else { + delta_list_number += lists_packed; + } + + uds_update_index_page_map(volume->index_page_map, + chapter_index->virtual_chapter_number, + physical_chapter_number, index_page_number, + delta_list_number - 1); + + mutex_lock(&volume->read_threads_mutex); + result = donate_index_page_locked(volume, physical_chapter_number, + index_page_number, page_buffer); + mutex_unlock(&volume->read_threads_mutex); + if (result != UDS_SUCCESS) { + dm_bufio_release(page_buffer); + return result; + } + } + + return UDS_SUCCESS; +} + +static u32 encode_tree(u8 record_page[], + const struct uds_volume_record *sorted_pointers[], + u32 next_record, u32 node, u32 node_count) +{ + if (node < node_count) { + u32 child = (2 * node) + 1; + + next_record = encode_tree(record_page, sorted_pointers, next_record, + child, node_count); + + /* + * In-order traversal: copy the contents of the next record into the page at the + * node offset. + */ + memcpy(&record_page[node * BYTES_PER_RECORD], + sorted_pointers[next_record++], BYTES_PER_RECORD); + + next_record = encode_tree(record_page, sorted_pointers, next_record, + child + 1, node_count); + } + + return next_record; +} + +static int encode_record_page(const struct volume *volume, + const struct uds_volume_record records[], u8 record_page[]) +{ + int result; + u32 i; + u32 records_per_page = volume->geometry->records_per_page; + const struct uds_volume_record **record_pointers = volume->record_pointers; + + for (i = 0; i < records_per_page; i++) + record_pointers[i] = &records[i]; + + /* + * Sort the record pointers by using just the names in the records, which is less work than + * sorting the entire record values. + */ + BUILD_BUG_ON(offsetof(struct uds_volume_record, name) != 0); + result = uds_radix_sort(volume->radix_sorter, (const u8 **) record_pointers, + records_per_page, UDS_RECORD_NAME_SIZE); + if (result != UDS_SUCCESS) + return result; + + encode_tree(record_page, record_pointers, 0, 0, records_per_page); + return UDS_SUCCESS; +} + +static int write_record_pages(struct volume *volume, u32 physical_chapter_number, + const struct uds_volume_record *records) +{ + u32 record_page_number; + struct index_geometry *geometry = volume->geometry; + struct dm_buffer *page_buffer; + const struct uds_volume_record *next_record = records; + u32 first_record_page = map_to_physical_page(geometry, physical_chapter_number, + geometry->index_pages_per_chapter); + + for (record_page_number = 0; + record_page_number < geometry->record_pages_per_chapter; + record_page_number++) { + u8 *page_data; + u32 physical_page = first_record_page + record_page_number; + int result; + + page_data = dm_bufio_new(volume->client, physical_page, &page_buffer); + if (IS_ERR(page_data)) { + return vdo_log_warning_strerror(-PTR_ERR(page_data), + "failed to prepare record page"); + } + + result = encode_record_page(volume, next_record, page_data); + if (result != UDS_SUCCESS) { + dm_bufio_release(page_buffer); + return vdo_log_warning_strerror(result, + "failed to encode record page %u", + record_page_number); + } + + next_record += geometry->records_per_page; + dm_bufio_mark_buffer_dirty(page_buffer); + dm_bufio_release(page_buffer); + } + + return UDS_SUCCESS; +} + +int uds_write_chapter(struct volume *volume, struct open_chapter_index *chapter_index, + const struct uds_volume_record *records) +{ + int result; + u32 physical_chapter_number = + uds_map_to_physical_chapter(volume->geometry, + chapter_index->virtual_chapter_number); + + result = write_index_pages(volume, physical_chapter_number, chapter_index); + if (result != UDS_SUCCESS) + return result; + + result = write_record_pages(volume, physical_chapter_number, records); + if (result != UDS_SUCCESS) + return result; + + result = -dm_bufio_write_dirty_buffers(volume->client); + if (result != UDS_SUCCESS) + vdo_log_error_strerror(result, "cannot sync chapter to volume"); + + return result; +} + +static void probe_chapter(struct volume *volume, u32 chapter_number, + u64 *virtual_chapter_number) +{ + const struct index_geometry *geometry = volume->geometry; + u32 expected_list_number = 0; + u32 i; + u64 vcn = BAD_CHAPTER; + + *virtual_chapter_number = BAD_CHAPTER; + dm_bufio_prefetch(volume->client, + map_to_physical_page(geometry, chapter_number, 0), + geometry->index_pages_per_chapter); + + for (i = 0; i < geometry->index_pages_per_chapter; i++) { + struct delta_index_page *page; + int result; + + result = uds_get_volume_index_page(volume, chapter_number, i, &page); + if (result != UDS_SUCCESS) + return; + + if (page->virtual_chapter_number == BAD_CHAPTER) { + vdo_log_error("corrupt index page in chapter %u", + chapter_number); + return; + } + + if (vcn == BAD_CHAPTER) { + vcn = page->virtual_chapter_number; + } else if (page->virtual_chapter_number != vcn) { + vdo_log_error("inconsistent chapter %u index page %u: expected vcn %llu, got vcn %llu", + chapter_number, i, (unsigned long long) vcn, + (unsigned long long) page->virtual_chapter_number); + return; + } + + if (expected_list_number != page->lowest_list_number) { + vdo_log_error("inconsistent chapter %u index page %u: expected list number %u, got list number %u", + chapter_number, i, expected_list_number, + page->lowest_list_number); + return; + } + expected_list_number = page->highest_list_number + 1; + + result = uds_validate_chapter_index_page(page, geometry); + if (result != UDS_SUCCESS) + return; + } + + if (chapter_number != uds_map_to_physical_chapter(geometry, vcn)) { + vdo_log_error("chapter %u vcn %llu is out of phase (%u)", chapter_number, + (unsigned long long) vcn, geometry->chapters_per_volume); + return; + } + + *virtual_chapter_number = vcn; +} + +/* Find the last valid physical chapter in the volume. */ +static void find_real_end_of_volume(struct volume *volume, u32 limit, u32 *limit_ptr) +{ + u32 span = 1; + u32 tries = 0; + + while (limit > 0) { + u32 chapter = (span > limit) ? 0 : limit - span; + u64 vcn = 0; + + probe_chapter(volume, chapter, &vcn); + if (vcn == BAD_CHAPTER) { + limit = chapter; + if (++tries > 1) + span *= 2; + } else { + if (span == 1) + break; + span /= 2; + tries = 0; + } + } + + *limit_ptr = limit; +} + +static int find_chapter_limits(struct volume *volume, u32 chapter_limit, u64 *lowest_vcn, + u64 *highest_vcn) +{ + struct index_geometry *geometry = volume->geometry; + u64 zero_vcn; + u64 lowest = BAD_CHAPTER; + u64 highest = BAD_CHAPTER; + u64 moved_chapter = BAD_CHAPTER; + u32 left_chapter = 0; + u32 right_chapter = 0; + u32 bad_chapters = 0; + + /* + * This method assumes there is at most one run of contiguous bad chapters caused by + * unflushed writes. Either the bad spot is at the beginning and end, or somewhere in the + * middle. Wherever it is, the highest and lowest VCNs are adjacent to it. Otherwise the + * volume is cleanly saved and somewhere in the middle of it the highest VCN immediately + * precedes the lowest one. + */ + + /* It doesn't matter if this results in a bad spot (BAD_CHAPTER). */ + probe_chapter(volume, 0, &zero_vcn); + + /* + * Binary search for end of the discontinuity in the monotonically increasing virtual + * chapter numbers; bad spots are treated as a span of BAD_CHAPTER values. In effect we're + * searching for the index of the smallest value less than zero_vcn. In the case we go off + * the end it means that chapter 0 has the lowest vcn. + * + * If a virtual chapter is out-of-order, it will be the one moved by conversion. Always + * skip over the moved chapter when searching, adding it to the range at the end if + * necessary. + */ + if (geometry->remapped_physical > 0) { + u64 remapped_vcn; + + probe_chapter(volume, geometry->remapped_physical, &remapped_vcn); + if (remapped_vcn == geometry->remapped_virtual) + moved_chapter = geometry->remapped_physical; + } + + left_chapter = 0; + right_chapter = chapter_limit; + + while (left_chapter < right_chapter) { + u64 probe_vcn; + u32 chapter = (left_chapter + right_chapter) / 2; + + if (chapter == moved_chapter) + chapter--; + + probe_chapter(volume, chapter, &probe_vcn); + if (zero_vcn <= probe_vcn) { + left_chapter = chapter + 1; + if (left_chapter == moved_chapter) + left_chapter++; + } else { + right_chapter = chapter; + } + } + + /* If left_chapter goes off the end, chapter 0 has the lowest virtual chapter number.*/ + if (left_chapter >= chapter_limit) + left_chapter = 0; + + /* At this point, left_chapter is the chapter with the lowest virtual chapter number. */ + probe_chapter(volume, left_chapter, &lowest); + + /* The moved chapter might be the lowest in the range. */ + if ((moved_chapter != BAD_CHAPTER) && (lowest == geometry->remapped_virtual + 1)) + lowest = geometry->remapped_virtual; + + /* + * Circularly scan backwards, moving over any bad chapters until encountering a good one, + * which is the chapter with the highest vcn. + */ + while (highest == BAD_CHAPTER) { + right_chapter = (right_chapter + chapter_limit - 1) % chapter_limit; + if (right_chapter == moved_chapter) + continue; + + probe_chapter(volume, right_chapter, &highest); + if (bad_chapters++ >= MAX_BAD_CHAPTERS) { + vdo_log_error("too many bad chapters in volume: %u", + bad_chapters); + return UDS_CORRUPT_DATA; + } + } + + *lowest_vcn = lowest; + *highest_vcn = highest; + return UDS_SUCCESS; +} + +/* + * Find the highest and lowest contiguous chapters present in the volume and determine their + * virtual chapter numbers. This is used by rebuild. + */ +int uds_find_volume_chapter_boundaries(struct volume *volume, u64 *lowest_vcn, + u64 *highest_vcn, bool *is_empty) +{ + u32 chapter_limit = volume->geometry->chapters_per_volume; + + find_real_end_of_volume(volume, chapter_limit, &chapter_limit); + if (chapter_limit == 0) { + *lowest_vcn = 0; + *highest_vcn = 0; + *is_empty = true; + return UDS_SUCCESS; + } + + *is_empty = false; + return find_chapter_limits(volume, chapter_limit, lowest_vcn, highest_vcn); +} + +int __must_check uds_replace_volume_storage(struct volume *volume, + struct index_layout *layout, + struct block_device *bdev) +{ + int result; + u32 i; + + result = uds_replace_index_layout_storage(layout, bdev); + if (result != UDS_SUCCESS) + return result; + + /* Release all outstanding dm_bufio objects */ + for (i = 0; i < volume->page_cache.indexable_pages; i++) + volume->page_cache.index[i] = volume->page_cache.cache_slots; + for (i = 0; i < volume->page_cache.cache_slots; i++) + clear_cache_page(&volume->page_cache, &volume->page_cache.cache[i]); + if (volume->sparse_cache != NULL) + uds_invalidate_sparse_cache(volume->sparse_cache); + if (volume->client != NULL) + dm_bufio_client_destroy(vdo_forget(volume->client)); + + return uds_open_volume_bufio(layout, volume->geometry->bytes_per_page, + volume->reserved_buffers, &volume->client); +} + +static int __must_check initialize_page_cache(struct page_cache *cache, + const struct index_geometry *geometry, + u32 chapters_in_cache, + unsigned int zone_count) +{ + int result; + u32 i; + + cache->indexable_pages = geometry->pages_per_volume + 1; + cache->cache_slots = chapters_in_cache * geometry->record_pages_per_chapter; + cache->zone_count = zone_count; + atomic64_set(&cache->clock, 1); + + result = VDO_ASSERT((cache->cache_slots <= VOLUME_CACHE_MAX_ENTRIES), + "requested cache size, %u, within limit %u", + cache->cache_slots, VOLUME_CACHE_MAX_ENTRIES); + if (result != VDO_SUCCESS) + return result; + + result = vdo_allocate(VOLUME_CACHE_MAX_QUEUED_READS, struct queued_read, + "volume read queue", &cache->read_queue); + if (result != VDO_SUCCESS) + return result; + + result = vdo_allocate(cache->zone_count, struct search_pending_counter, + "Volume Cache Zones", &cache->search_pending_counters); + if (result != VDO_SUCCESS) + return result; + + result = vdo_allocate(cache->indexable_pages, u16, "page cache index", + &cache->index); + if (result != VDO_SUCCESS) + return result; + + result = vdo_allocate(cache->cache_slots, struct cached_page, "page cache cache", + &cache->cache); + if (result != VDO_SUCCESS) + return result; + + /* Initialize index values to invalid values. */ + for (i = 0; i < cache->indexable_pages; i++) + cache->index[i] = cache->cache_slots; + + for (i = 0; i < cache->cache_slots; i++) + clear_cache_page(cache, &cache->cache[i]); + + return UDS_SUCCESS; +} + +int uds_make_volume(const struct uds_configuration *config, struct index_layout *layout, + struct volume **new_volume) +{ + unsigned int i; + struct volume *volume = NULL; + struct index_geometry *geometry; + unsigned int reserved_buffers; + int result; + + result = vdo_allocate(1, struct volume, "volume", &volume); + if (result != VDO_SUCCESS) + return result; + + volume->nonce = uds_get_volume_nonce(layout); + + result = uds_copy_index_geometry(config->geometry, &volume->geometry); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return vdo_log_warning_strerror(result, + "failed to allocate geometry: error"); + } + geometry = volume->geometry; + + /* + * Reserve a buffer for each entry in the page cache, one for the chapter writer, and one + * for each entry in the sparse cache. + */ + reserved_buffers = config->cache_chapters * geometry->record_pages_per_chapter; + reserved_buffers += 1; + if (uds_is_sparse_index_geometry(geometry)) + reserved_buffers += (config->cache_chapters * geometry->index_pages_per_chapter); + volume->reserved_buffers = reserved_buffers; + result = uds_open_volume_bufio(layout, geometry->bytes_per_page, + volume->reserved_buffers, &volume->client); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = uds_make_radix_sorter(geometry->records_per_page, + &volume->radix_sorter); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + result = vdo_allocate(geometry->records_per_page, + const struct uds_volume_record *, "record pointers", + &volume->record_pointers); + if (result != VDO_SUCCESS) { + uds_free_volume(volume); + return result; + } + + if (uds_is_sparse_index_geometry(geometry)) { + size_t page_size = sizeof(struct delta_index_page) + geometry->bytes_per_page; + + result = uds_make_sparse_cache(geometry, config->cache_chapters, + config->zone_count, + &volume->sparse_cache); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->cache_size = + page_size * geometry->index_pages_per_chapter * config->cache_chapters; + } + + result = initialize_page_cache(&volume->page_cache, geometry, + config->cache_chapters, config->zone_count); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->cache_size += volume->page_cache.cache_slots * sizeof(struct delta_index_page); + result = uds_make_index_page_map(geometry, &volume->index_page_map); + if (result != UDS_SUCCESS) { + uds_free_volume(volume); + return result; + } + + mutex_init(&volume->read_threads_mutex); + uds_init_cond(&volume->read_threads_read_done_cond); + uds_init_cond(&volume->read_threads_cond); + + result = vdo_allocate(config->read_threads, struct thread *, "reader threads", + &volume->reader_threads); + if (result != VDO_SUCCESS) { + uds_free_volume(volume); + return result; + } + + for (i = 0; i < config->read_threads; i++) { + result = vdo_create_thread(read_thread_function, (void *) volume, + "reader", &volume->reader_threads[i]); + if (result != VDO_SUCCESS) { + uds_free_volume(volume); + return result; + } + + volume->read_thread_count = i + 1; + } + + *new_volume = volume; + return UDS_SUCCESS; +} + +static void uninitialize_page_cache(struct page_cache *cache) +{ + u16 i; + + if (cache->cache != NULL) { + for (i = 0; i < cache->cache_slots; i++) + release_page_buffer(&cache->cache[i]); + } + vdo_free(cache->index); + vdo_free(cache->cache); + vdo_free(cache->search_pending_counters); + vdo_free(cache->read_queue); +} + +void uds_free_volume(struct volume *volume) +{ + if (volume == NULL) + return; + + if (volume->reader_threads != NULL) { + unsigned int i; + + /* This works even if some threads weren't started. */ + mutex_lock(&volume->read_threads_mutex); + volume->read_threads_exiting = true; + uds_broadcast_cond(&volume->read_threads_cond); + mutex_unlock(&volume->read_threads_mutex); + for (i = 0; i < volume->read_thread_count; i++) + vdo_join_threads(volume->reader_threads[i]); + vdo_free(volume->reader_threads); + volume->reader_threads = NULL; + } + + /* Must destroy the client AFTER freeing the cached pages. */ + uninitialize_page_cache(&volume->page_cache); + uds_free_sparse_cache(volume->sparse_cache); + if (volume->client != NULL) + dm_bufio_client_destroy(vdo_forget(volume->client)); + + uds_free_index_page_map(volume->index_page_map); + uds_free_radix_sorter(volume->radix_sorter); + vdo_free(volume->geometry); + vdo_free(volume->record_pointers); + vdo_free(volume); +} diff --git a/drivers/md/dm-vdo/indexer/volume.h b/drivers/md/dm-vdo/indexer/volume.h new file mode 100644 index 0000000000..8679a5e553 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/volume.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright 2023 Red Hat + */ + +#ifndef UDS_VOLUME_H +#define UDS_VOLUME_H + +#include +#include +#include +#include + +#include "permassert.h" +#include "thread-utils.h" + +#include "chapter-index.h" +#include "config.h" +#include "geometry.h" +#include "indexer.h" +#include "index-layout.h" +#include "index-page-map.h" +#include "radix-sort.h" +#include "sparse-cache.h" + +/* + * The volume manages deduplication records on permanent storage. The term "volume" can also refer + * to the region of permanent storage where the records (and the chapters containing them) are + * stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages + * as necessary. + */ + +enum index_lookup_mode { + /* Always do lookups in all chapters normally */ + LOOKUP_NORMAL, + /* Only do a subset of lookups needed when rebuilding an index */ + LOOKUP_FOR_REBUILD, +}; + +struct queued_read { + bool invalid; + bool reserved; + u32 physical_page; + struct uds_request *first_request; + struct uds_request *last_request; +}; + +struct __aligned(L1_CACHE_BYTES) search_pending_counter { + u64 atomic_value; +}; + +struct cached_page { + /* Whether this page is currently being read asynchronously */ + bool read_pending; + /* The physical page stored in this cache entry */ + u32 physical_page; + /* The value of the volume clock when this page was last used */ + s64 last_used; + /* The cached page buffer */ + struct dm_buffer *buffer; + /* The chapter index page, meaningless for record pages */ + struct delta_index_page index_page; +}; + +struct page_cache { + /* The number of zones */ + unsigned int zone_count; + /* The number of volume pages that can be cached */ + u32 indexable_pages; + /* The maximum number of simultaneously cached pages */ + u16 cache_slots; + /* An index for each physical page noting where it is in the cache */ + u16 *index; + /* The array of cached pages */ + struct cached_page *cache; + /* A counter for each zone tracking if a search is occurring there */ + struct search_pending_counter *search_pending_counters; + /* The read queue entries as a circular array */ + struct queued_read *read_queue; + + /* All entries above this point are constant after initialization. */ + + /* + * These values are all indexes into the array of read queue entries. New entries in the + * read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the + * lock and then claims the entry pointed to by read_queue_next_read and increments that + * value. After the read is completed, the reader thread calls release_read_queue_entry(), + * which increments read_queue_first until it points to a pending read, or is equal to + * read_queue_next_read. This means that if multiple reads are outstanding, + * read_queue_first might not advance until the last of the reads finishes. + */ + u16 read_queue_first; + u16 read_queue_next_read; + u16 read_queue_last; + + atomic64_t clock; +}; + +struct volume { + struct index_geometry *geometry; + struct dm_bufio_client *client; + u64 nonce; + size_t cache_size; + + /* A single page worth of records, for sorting */ + const struct uds_volume_record **record_pointers; + /* Sorter for sorting records within each page */ + struct radix_sorter *radix_sorter; + + struct sparse_cache *sparse_cache; + struct page_cache page_cache; + struct index_page_map *index_page_map; + + struct mutex read_threads_mutex; + struct cond_var read_threads_cond; + struct cond_var read_threads_read_done_cond; + struct thread **reader_threads; + unsigned int read_thread_count; + bool read_threads_exiting; + + enum index_lookup_mode lookup_mode; + unsigned int reserved_buffers; +}; + +int __must_check uds_make_volume(const struct uds_configuration *config, + struct index_layout *layout, + struct volume **new_volume); + +void uds_free_volume(struct volume *volume); + +int __must_check uds_replace_volume_storage(struct volume *volume, + struct index_layout *layout, + struct block_device *bdev); + +int __must_check uds_find_volume_chapter_boundaries(struct volume *volume, + u64 *lowest_vcn, u64 *highest_vcn, + bool *is_empty); + +int __must_check uds_search_volume_page_cache(struct volume *volume, + struct uds_request *request, + bool *found); + +int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume, + const struct uds_record_name *name, + u64 virtual_chapter, + bool *found); + +int __must_check uds_search_cached_record_page(struct volume *volume, + struct uds_request *request, u32 chapter, + u16 record_page_number, bool *found); + +void uds_forget_chapter(struct volume *volume, u64 chapter); + +int __must_check uds_write_chapter(struct volume *volume, + struct open_chapter_index *chapter_index, + const struct uds_volume_record records[]); + +void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter); + +int __must_check uds_read_chapter_index_from_volume(const struct volume *volume, + u64 virtual_chapter, + struct dm_buffer *volume_buffers[], + struct delta_index_page index_pages[]); + +int __must_check uds_get_volume_record_page(struct volume *volume, u32 chapter, + u32 page_number, u8 **data_ptr); + +int __must_check uds_get_volume_index_page(struct volume *volume, u32 chapter, + u32 page_number, + struct delta_index_page **page_ptr); + +#endif /* UDS_VOLUME_H */ -- cgit v1.2.3