diff options
Diffstat (limited to 'drivers/md/dm-vdo/indexer/index.c')
-rw-r--r-- | drivers/md/dm-vdo/indexer/index.c | 1388 |
1 files changed, 1388 insertions, 0 deletions
diff --git a/drivers/md/dm-vdo/indexer/index.c b/drivers/md/dm-vdo/indexer/index.c new file mode 100644 index 0000000000..1ba7671444 --- /dev/null +++ b/drivers/md/dm-vdo/indexer/index.c @@ -0,0 +1,1388 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2023 Red Hat + */ + + +#include "index.h" + +#include "logger.h" +#include "memory-alloc.h" + +#include "funnel-requestqueue.h" +#include "hash-utils.h" +#include "sparse-cache.h" + +static const u64 NO_LAST_SAVE = U64_MAX; + +/* + * When searching for deduplication records, the index first searches the volume index, and then + * searches the chapter index for the relevant chapter. If the chapter has been fully committed to + * storage, the chapter pages are loaded into the page cache. If the chapter has not yet been + * committed (either the open chapter or a recently closed one), the index searches the in-memory + * representation of the chapter. Finally, if the volume index does not find a record and the index + * is sparse, the index will search the sparse cache. + * + * The index send two kinds of messages to coordinate between zones: chapter close messages for the + * chapter writer, and sparse cache barrier messages for the sparse cache. + * + * The chapter writer is responsible for committing chapters of records to storage. Since zones can + * get different numbers of records, some zones may fall behind others. Each time a zone fills up + * its available space in a chapter, it informs the chapter writer that the chapter is complete, + * and also informs all other zones that it has closed the chapter. Each other zone will then close + * the chapter immediately, regardless of how full it is, in order to minimize skew between zones. + * Once every zone has closed the chapter, the chapter writer will commit that chapter to storage. + * + * The last zone to close the chapter also removes the oldest chapter from the volume index. + * Although that chapter is invalid for zones that have moved on, the existence of the open chapter + * means that those zones will never ask the volume index about it. No zone is allowed to get more + * than one chapter ahead of any other. If a zone is so far ahead that it tries to close another + * chapter before the previous one has been closed by all zones, it is forced to wait. + * + * The sparse cache relies on having the same set of chapter indexes available to all zones. When a + * request wants to add a chapter to the sparse cache, it sends a barrier message to each zone + * during the triage stage that acts as a rendezvous. Once every zone has reached the barrier and + * paused its operations, the cache membership is changed and each zone is then informed that it + * can proceed. More details can be found in the sparse cache documentation. + * + * If a sparse cache has only one zone, it will not create a triage queue, but it still needs the + * barrier message to change the sparse cache membership, so the index simulates the message by + * invoking the handler directly. + */ + +struct chapter_writer { + /* The index to which we belong */ + struct uds_index *index; + /* The thread to do the writing */ + struct thread *thread; + /* The lock protecting the following fields */ + struct mutex mutex; + /* The condition signalled on state changes */ + struct cond_var cond; + /* Set to true to stop the thread */ + bool stop; + /* The result from the most recent write */ + int result; + /* The number of bytes allocated by the chapter writer */ + size_t memory_size; + /* The number of zones which have submitted a chapter for writing */ + unsigned int zones_to_write; + /* Open chapter index used by uds_close_open_chapter() */ + struct open_chapter_index *open_chapter_index; + /* Collated records used by uds_close_open_chapter() */ + struct uds_volume_record *collated_records; + /* The chapters to write (one per zone) */ + struct open_chapter_zone *chapters[]; +}; + +static bool is_zone_chapter_sparse(const struct index_zone *zone, u64 virtual_chapter) +{ + return uds_is_chapter_sparse(zone->index->volume->geometry, + zone->oldest_virtual_chapter, + zone->newest_virtual_chapter, virtual_chapter); +} + +static int launch_zone_message(struct uds_zone_message message, unsigned int zone, + struct uds_index *index) +{ + int result; + struct uds_request *request; + + result = vdo_allocate(1, struct uds_request, __func__, &request); + if (result != VDO_SUCCESS) + return result; + + request->index = index; + request->unbatched = true; + request->zone_number = zone; + request->zone_message = message; + + uds_enqueue_request(request, STAGE_MESSAGE); + return UDS_SUCCESS; +} + +static void enqueue_barrier_messages(struct uds_index *index, u64 virtual_chapter) +{ + struct uds_zone_message message = { + .type = UDS_MESSAGE_SPARSE_CACHE_BARRIER, + .virtual_chapter = virtual_chapter, + }; + unsigned int zone; + + for (zone = 0; zone < index->zone_count; zone++) { + int result = launch_zone_message(message, zone, index); + + VDO_ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation"); + } +} + +/* + * Determine whether this request should trigger a sparse cache barrier message to change the + * membership of the sparse cache. If a change in membership is desired, the function returns the + * chapter number to add. + */ +static u64 triage_index_request(struct uds_index *index, struct uds_request *request) +{ + u64 virtual_chapter; + struct index_zone *zone; + + virtual_chapter = uds_lookup_volume_index_name(index->volume_index, + &request->record_name); + if (virtual_chapter == NO_CHAPTER) + return NO_CHAPTER; + + zone = index->zones[request->zone_number]; + if (!is_zone_chapter_sparse(zone, virtual_chapter)) + return NO_CHAPTER; + + /* + * FIXME: Optimize for a common case by remembering the chapter from the most recent + * barrier message and skipping this chapter if is it the same. + */ + + return virtual_chapter; +} + +/* + * Simulate a message to change the sparse cache membership for a single-zone sparse index. This + * allows us to forgo the complicated locking required by a multi-zone sparse index. Any other kind + * of index does nothing here. + */ +static int simulate_index_zone_barrier_message(struct index_zone *zone, + struct uds_request *request) +{ + u64 sparse_virtual_chapter; + + if ((zone->index->zone_count > 1) || + !uds_is_sparse_index_geometry(zone->index->volume->geometry)) + return UDS_SUCCESS; + + sparse_virtual_chapter = triage_index_request(zone->index, request); + if (sparse_virtual_chapter == NO_CHAPTER) + return UDS_SUCCESS; + + return uds_update_sparse_cache(zone, sparse_virtual_chapter); +} + +/* This is the request processing function for the triage queue. */ +static void triage_request(struct uds_request *request) +{ + struct uds_index *index = request->index; + u64 sparse_virtual_chapter = triage_index_request(index, request); + + if (sparse_virtual_chapter != NO_CHAPTER) + enqueue_barrier_messages(index, sparse_virtual_chapter); + + uds_enqueue_request(request, STAGE_INDEX); +} + +static int finish_previous_chapter(struct uds_index *index, u64 current_chapter_number) +{ + int result; + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + while (index->newest_virtual_chapter < current_chapter_number) + uds_wait_cond(&writer->cond, &writer->mutex); + result = writer->result; + mutex_unlock(&writer->mutex); + + if (result != UDS_SUCCESS) + return vdo_log_error_strerror(result, + "Writing of previous open chapter failed"); + + return UDS_SUCCESS; +} + +static int swap_open_chapter(struct index_zone *zone) +{ + int result; + struct open_chapter_zone *temporary_chapter; + + result = finish_previous_chapter(zone->index, zone->newest_virtual_chapter); + if (result != UDS_SUCCESS) + return result; + + temporary_chapter = zone->open_chapter; + zone->open_chapter = zone->writing_chapter; + zone->writing_chapter = temporary_chapter; + return UDS_SUCCESS; +} + +/* + * Inform the chapter writer that this zone is done with this chapter. The chapter won't start + * writing until all zones have closed it. + */ +static unsigned int start_closing_chapter(struct uds_index *index, + unsigned int zone_number, + struct open_chapter_zone *chapter) +{ + unsigned int finished_zones; + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + finished_zones = ++writer->zones_to_write; + writer->chapters[zone_number] = chapter; + uds_broadcast_cond(&writer->cond); + mutex_unlock(&writer->mutex); + + return finished_zones; +} + +static int announce_chapter_closed(struct index_zone *zone, u64 closed_chapter) +{ + int result; + unsigned int i; + struct uds_zone_message zone_message = { + .type = UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED, + .virtual_chapter = closed_chapter, + }; + + for (i = 0; i < zone->index->zone_count; i++) { + if (zone->id == i) + continue; + + result = launch_zone_message(zone_message, i, zone->index); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +static int open_next_chapter(struct index_zone *zone) +{ + int result; + u64 closed_chapter; + u64 expiring; + unsigned int finished_zones; + u32 expire_chapters; + + vdo_log_debug("closing chapter %llu of zone %u after %u entries (%u short)", + (unsigned long long) zone->newest_virtual_chapter, zone->id, + zone->open_chapter->size, + zone->open_chapter->capacity - zone->open_chapter->size); + + result = swap_open_chapter(zone); + if (result != UDS_SUCCESS) + return result; + + closed_chapter = zone->newest_virtual_chapter++; + uds_set_volume_index_zone_open_chapter(zone->index->volume_index, zone->id, + zone->newest_virtual_chapter); + uds_reset_open_chapter(zone->open_chapter); + + finished_zones = start_closing_chapter(zone->index, zone->id, + zone->writing_chapter); + if ((finished_zones == 1) && (zone->index->zone_count > 1)) { + result = announce_chapter_closed(zone, closed_chapter); + if (result != UDS_SUCCESS) + return result; + } + + expiring = zone->oldest_virtual_chapter; + expire_chapters = uds_chapters_to_expire(zone->index->volume->geometry, + zone->newest_virtual_chapter); + zone->oldest_virtual_chapter += expire_chapters; + + if (finished_zones < zone->index->zone_count) + return UDS_SUCCESS; + + while (expire_chapters-- > 0) + uds_forget_chapter(zone->index->volume, expiring++); + + return UDS_SUCCESS; +} + +static int handle_chapter_closed(struct index_zone *zone, u64 virtual_chapter) +{ + if (zone->newest_virtual_chapter == virtual_chapter) + return open_next_chapter(zone); + + return UDS_SUCCESS; +} + +static int dispatch_index_zone_control_request(struct uds_request *request) +{ + struct uds_zone_message *message = &request->zone_message; + struct index_zone *zone = request->index->zones[request->zone_number]; + + switch (message->type) { + case UDS_MESSAGE_SPARSE_CACHE_BARRIER: + return uds_update_sparse_cache(zone, message->virtual_chapter); + + case UDS_MESSAGE_ANNOUNCE_CHAPTER_CLOSED: + return handle_chapter_closed(zone, message->virtual_chapter); + + default: + vdo_log_error("invalid message type: %d", message->type); + return UDS_INVALID_ARGUMENT; + } +} + +static void set_request_location(struct uds_request *request, + enum uds_index_region new_location) +{ + request->location = new_location; + request->found = ((new_location == UDS_LOCATION_IN_OPEN_CHAPTER) || + (new_location == UDS_LOCATION_IN_DENSE) || + (new_location == UDS_LOCATION_IN_SPARSE)); +} + +static void set_chapter_location(struct uds_request *request, + const struct index_zone *zone, u64 virtual_chapter) +{ + request->found = true; + if (virtual_chapter == zone->newest_virtual_chapter) + request->location = UDS_LOCATION_IN_OPEN_CHAPTER; + else if (is_zone_chapter_sparse(zone, virtual_chapter)) + request->location = UDS_LOCATION_IN_SPARSE; + else + request->location = UDS_LOCATION_IN_DENSE; +} + +static int search_sparse_cache_in_zone(struct index_zone *zone, struct uds_request *request, + u64 virtual_chapter, bool *found) +{ + int result; + struct volume *volume; + u16 record_page_number; + u32 chapter; + + result = uds_search_sparse_cache(zone, &request->record_name, &virtual_chapter, + &record_page_number); + if ((result != UDS_SUCCESS) || (virtual_chapter == NO_CHAPTER)) + return result; + + request->virtual_chapter = virtual_chapter; + volume = zone->index->volume; + chapter = uds_map_to_physical_chapter(volume->geometry, virtual_chapter); + return uds_search_cached_record_page(volume, request, chapter, + record_page_number, found); +} + +static int get_record_from_zone(struct index_zone *zone, struct uds_request *request, + bool *found) +{ + struct volume *volume; + + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + *found = true; + return UDS_SUCCESS; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + *found = false; + return UDS_SUCCESS; + } + + if (request->virtual_chapter == zone->newest_virtual_chapter) { + uds_search_open_chapter(zone->open_chapter, &request->record_name, + &request->old_metadata, found); + return UDS_SUCCESS; + } + + if ((zone->newest_virtual_chapter > 0) && + (request->virtual_chapter == (zone->newest_virtual_chapter - 1)) && + (zone->writing_chapter->size > 0)) { + uds_search_open_chapter(zone->writing_chapter, &request->record_name, + &request->old_metadata, found); + return UDS_SUCCESS; + } + + volume = zone->index->volume; + if (is_zone_chapter_sparse(zone, request->virtual_chapter) && + uds_sparse_cache_contains(volume->sparse_cache, request->virtual_chapter, + request->zone_number)) + return search_sparse_cache_in_zone(zone, request, + request->virtual_chapter, found); + + return uds_search_volume_page_cache(volume, request, found); +} + +static int put_record_in_zone(struct index_zone *zone, struct uds_request *request, + const struct uds_record_data *metadata) +{ + unsigned int remaining; + + remaining = uds_put_open_chapter(zone->open_chapter, &request->record_name, + metadata); + if (remaining == 0) + return open_next_chapter(zone); + + return UDS_SUCCESS; +} + +static int search_index_zone(struct index_zone *zone, struct uds_request *request) +{ + int result; + struct volume_index_record record; + bool overflow_record, found = false; + struct uds_record_data *metadata; + u64 chapter; + + result = uds_get_volume_index_record(zone->index->volume_index, + &request->record_name, &record); + if (result != UDS_SUCCESS) + return result; + + if (record.is_found) { + if (request->requeued && request->virtual_chapter != record.virtual_chapter) + set_request_location(request, UDS_LOCATION_UNKNOWN); + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) + return result; + } + + if (found) + set_chapter_location(request, zone, record.virtual_chapter); + + /* + * If a record has overflowed a chapter index in more than one chapter (or overflowed in + * one chapter and collided with an existing record), it will exist as a collision record + * in the volume index, but we won't find it in the volume. This case needs special + * handling. + */ + overflow_record = (record.is_found && record.is_collision && !found); + chapter = zone->newest_virtual_chapter; + if (found || overflow_record) { + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && overflow_record)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + if (record.virtual_chapter != chapter) { + /* + * Update the volume index to reference the new chapter for the block. If + * the record had been deleted or dropped from the chapter index, it will + * be back. + */ + result = uds_set_volume_index_record_chapter(&record, chapter); + } else if (request->type != UDS_UPDATE) { + /* The record is already in the open chapter. */ + return UDS_SUCCESS; + } + } else { + /* + * The record wasn't in the volume index, so check whether the + * name is in a cached sparse chapter. If we found the name on + * a previous search, use that result instead. + */ + if (request->location == UDS_LOCATION_RECORD_PAGE_LOOKUP) { + found = true; + } else if (request->location == UDS_LOCATION_UNAVAILABLE) { + found = false; + } else if (uds_is_sparse_index_geometry(zone->index->volume->geometry) && + !uds_is_volume_index_sample(zone->index->volume_index, + &request->record_name)) { + result = search_sparse_cache_in_zone(zone, request, NO_CHAPTER, + &found); + if (result != UDS_SUCCESS) + return result; + } + + if (found) + set_request_location(request, UDS_LOCATION_IN_SPARSE); + + if ((request->type == UDS_QUERY_NO_UPDATE) || + ((request->type == UDS_QUERY) && !found)) { + /* There is nothing left to do. */ + return UDS_SUCCESS; + } + + /* + * Add a new entry to the volume index referencing the open chapter. This needs to + * be done both for new records, and for records from cached sparse chapters. + */ + result = uds_put_volume_index_record(&record, chapter); + } + + if (result == UDS_OVERFLOW) { + /* + * The volume index encountered a delta list overflow. The condition was already + * logged. We will go on without adding the record to the open chapter. + */ + return UDS_SUCCESS; + } + + if (result != UDS_SUCCESS) + return result; + + if (!found || (request->type == UDS_UPDATE)) { + /* This is a new record or we're updating an existing record. */ + metadata = &request->new_metadata; + } else { + /* Move the existing record to the open chapter. */ + metadata = &request->old_metadata; + } + + return put_record_in_zone(zone, request, metadata); +} + +static int remove_from_index_zone(struct index_zone *zone, struct uds_request *request) +{ + int result; + struct volume_index_record record; + + result = uds_get_volume_index_record(zone->index->volume_index, + &request->record_name, &record); + if (result != UDS_SUCCESS) + return result; + + if (!record.is_found) + return UDS_SUCCESS; + + /* If the request was requeued, check whether the saved state is still valid. */ + + if (record.is_collision) { + set_chapter_location(request, zone, record.virtual_chapter); + } else { + /* Non-collision records are hints, so resolve the name in the chapter. */ + bool found; + + if (request->requeued && request->virtual_chapter != record.virtual_chapter) + set_request_location(request, UDS_LOCATION_UNKNOWN); + + request->virtual_chapter = record.virtual_chapter; + result = get_record_from_zone(zone, request, &found); + if (result != UDS_SUCCESS) + return result; + + if (!found) { + /* There is no record to remove. */ + return UDS_SUCCESS; + } + } + + set_chapter_location(request, zone, record.virtual_chapter); + + /* + * Delete the volume index entry for the named record only. Note that a later search might + * later return stale advice if there is a colliding name in the same chapter, but it's a + * very rare case (1 in 2^21). + */ + result = uds_remove_volume_index_record(&record); + if (result != UDS_SUCCESS) + return result; + + /* + * If the record is in the open chapter, we must remove it or mark it deleted to avoid + * trouble if the record is added again later. + */ + if (request->location == UDS_LOCATION_IN_OPEN_CHAPTER) + uds_remove_from_open_chapter(zone->open_chapter, &request->record_name); + + return UDS_SUCCESS; +} + +static int dispatch_index_request(struct uds_index *index, struct uds_request *request) +{ + int result; + struct index_zone *zone = index->zones[request->zone_number]; + + if (!request->requeued) { + result = simulate_index_zone_barrier_message(zone, request); + if (result != UDS_SUCCESS) + return result; + } + + switch (request->type) { + case UDS_POST: + case UDS_UPDATE: + case UDS_QUERY: + case UDS_QUERY_NO_UPDATE: + result = search_index_zone(zone, request); + break; + + case UDS_DELETE: + result = remove_from_index_zone(zone, request); + break; + + default: + result = vdo_log_warning_strerror(UDS_INVALID_ARGUMENT, + "invalid request type: %d", + request->type); + break; + } + + return result; +} + +/* This is the request processing function invoked by each zone's thread. */ +static void execute_zone_request(struct uds_request *request) +{ + int result; + struct uds_index *index = request->index; + + if (request->zone_message.type != UDS_MESSAGE_NONE) { + result = dispatch_index_zone_control_request(request); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, "error executing message: %d", + request->zone_message.type); + } + + /* Once the message is processed it can be freed. */ + vdo_free(vdo_forget(request)); + return; + } + + index->need_to_save = true; + if (request->requeued && (request->status != UDS_SUCCESS)) { + set_request_location(request, UDS_LOCATION_UNAVAILABLE); + index->callback(request); + return; + } + + result = dispatch_index_request(index, request); + if (result == UDS_QUEUED) { + /* The request has been requeued so don't let it complete. */ + return; + } + + if (!request->found) + set_request_location(request, UDS_LOCATION_UNAVAILABLE); + + request->status = result; + index->callback(request); +} + +static int initialize_index_queues(struct uds_index *index, + const struct index_geometry *geometry) +{ + int result; + unsigned int i; + + for (i = 0; i < index->zone_count; i++) { + result = uds_make_request_queue("indexW", &execute_zone_request, + &index->zone_queues[i]); + if (result != UDS_SUCCESS) + return result; + } + + /* The triage queue is only needed for sparse multi-zone indexes. */ + if ((index->zone_count > 1) && uds_is_sparse_index_geometry(geometry)) { + result = uds_make_request_queue("triageW", &triage_request, + &index->triage_queue); + if (result != UDS_SUCCESS) + return result; + } + + return UDS_SUCCESS; +} + +/* This is the driver function for the chapter writer thread. */ +static void close_chapters(void *arg) +{ + int result; + struct chapter_writer *writer = arg; + struct uds_index *index = writer->index; + + vdo_log_debug("chapter writer starting"); + mutex_lock(&writer->mutex); + for (;;) { + while (writer->zones_to_write < index->zone_count) { + if (writer->stop && (writer->zones_to_write == 0)) { + /* + * We've been told to stop, and all of the zones are in the same + * open chapter, so we can exit now. + */ + mutex_unlock(&writer->mutex); + vdo_log_debug("chapter writer stopping"); + return; + } + uds_wait_cond(&writer->cond, &writer->mutex); + } + + /* + * Release the lock while closing a chapter. We probably don't need to do this, but + * it seems safer in principle. It's OK to access the chapter and chapter_number + * fields without the lock since those aren't allowed to change until we're done. + */ + mutex_unlock(&writer->mutex); + + if (index->has_saved_open_chapter) { + /* + * Remove the saved open chapter the first time we close an open chapter + * after loading from a clean shutdown, or after doing a clean save. The + * lack of the saved open chapter will indicate that a recovery is + * necessary. + */ + index->has_saved_open_chapter = false; + result = uds_discard_open_chapter(index->layout); + if (result == UDS_SUCCESS) + vdo_log_debug("Discarding saved open chapter"); + } + + result = uds_close_open_chapter(writer->chapters, index->zone_count, + index->volume, + writer->open_chapter_index, + writer->collated_records, + index->newest_virtual_chapter); + + mutex_lock(&writer->mutex); + index->newest_virtual_chapter++; + index->oldest_virtual_chapter += + uds_chapters_to_expire(index->volume->geometry, + index->newest_virtual_chapter); + writer->result = result; + writer->zones_to_write = 0; + uds_broadcast_cond(&writer->cond); + } +} + +static void stop_chapter_writer(struct chapter_writer *writer) +{ + struct thread *writer_thread = NULL; + + mutex_lock(&writer->mutex); + if (writer->thread != NULL) { + writer_thread = writer->thread; + writer->thread = NULL; + writer->stop = true; + uds_broadcast_cond(&writer->cond); + } + mutex_unlock(&writer->mutex); + + if (writer_thread != NULL) + vdo_join_threads(writer_thread); +} + +static void free_chapter_writer(struct chapter_writer *writer) +{ + if (writer == NULL) + return; + + stop_chapter_writer(writer); + uds_free_open_chapter_index(writer->open_chapter_index); + vdo_free(writer->collated_records); + vdo_free(writer); +} + +static int make_chapter_writer(struct uds_index *index, + struct chapter_writer **writer_ptr) +{ + int result; + struct chapter_writer *writer; + size_t collated_records_size = + (sizeof(struct uds_volume_record) * index->volume->geometry->records_per_chapter); + + result = vdo_allocate_extended(struct chapter_writer, index->zone_count, + struct open_chapter_zone *, "Chapter Writer", + &writer); + if (result != VDO_SUCCESS) + return result; + + writer->index = index; + mutex_init(&writer->mutex); + uds_init_cond(&writer->cond); + + result = vdo_allocate_cache_aligned(collated_records_size, "collated records", + &writer->collated_records); + if (result != VDO_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + result = uds_make_open_chapter_index(&writer->open_chapter_index, + index->volume->geometry, + index->volume->nonce); + if (result != UDS_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + writer->memory_size = (sizeof(struct chapter_writer) + + index->zone_count * sizeof(struct open_chapter_zone *) + + collated_records_size + + writer->open_chapter_index->memory_size); + + result = vdo_create_thread(close_chapters, writer, "writer", &writer->thread); + if (result != VDO_SUCCESS) { + free_chapter_writer(writer); + return result; + } + + *writer_ptr = writer; + return UDS_SUCCESS; +} + +static int load_index(struct uds_index *index) +{ + int result; + u64 last_save_chapter; + + result = uds_load_index_state(index->layout, index); + if (result != UDS_SUCCESS) + return UDS_INDEX_NOT_SAVED_CLEANLY; + + last_save_chapter = ((index->last_save != NO_LAST_SAVE) ? index->last_save : 0); + + vdo_log_info("loaded index from chapter %llu through chapter %llu", + (unsigned long long) index->oldest_virtual_chapter, + (unsigned long long) last_save_chapter); + + return UDS_SUCCESS; +} + +static int rebuild_index_page_map(struct uds_index *index, u64 vcn) +{ + int result; + struct delta_index_page *chapter_index_page; + struct index_geometry *geometry = index->volume->geometry; + u32 chapter = uds_map_to_physical_chapter(geometry, vcn); + u32 expected_list_number = 0; + u32 index_page_number; + u32 lowest_delta_list; + u32 highest_delta_list; + + for (index_page_number = 0; + index_page_number < geometry->index_pages_per_chapter; + index_page_number++) { + result = uds_get_volume_index_page(index->volume, chapter, + index_page_number, + &chapter_index_page); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, + "failed to read index page %u in chapter %u", + index_page_number, chapter); + } + + lowest_delta_list = chapter_index_page->lowest_list_number; + highest_delta_list = chapter_index_page->highest_list_number; + if (lowest_delta_list != expected_list_number) { + return vdo_log_error_strerror(UDS_CORRUPT_DATA, + "chapter %u index page %u is corrupt", + chapter, index_page_number); + } + + uds_update_index_page_map(index->volume->index_page_map, vcn, chapter, + index_page_number, highest_delta_list); + expected_list_number = highest_delta_list + 1; + } + + return UDS_SUCCESS; +} + +static int replay_record(struct uds_index *index, const struct uds_record_name *name, + u64 virtual_chapter, bool will_be_sparse_chapter) +{ + int result; + struct volume_index_record record; + bool update_record; + + if (will_be_sparse_chapter && + !uds_is_volume_index_sample(index->volume_index, name)) { + /* + * This entry will be in a sparse chapter after the rebuild completes, and it is + * not a sample, so just skip over it. + */ + return UDS_SUCCESS; + } + + result = uds_get_volume_index_record(index->volume_index, name, &record); + if (result != UDS_SUCCESS) + return result; + + if (record.is_found) { + if (record.is_collision) { + if (record.virtual_chapter == virtual_chapter) { + /* The record is already correct. */ + return UDS_SUCCESS; + } + + update_record = true; + } else if (record.virtual_chapter == virtual_chapter) { + /* + * There is a volume index entry pointing to the current chapter, but we + * don't know if it is for the same name as the one we are currently + * working on or not. For now, we're just going to assume that it isn't. + * This will create one extra collision record if there was a deleted + * record in the current chapter. + */ + update_record = false; + } else { + /* + * If we're rebuilding, we don't normally want to go to disk to see if the + * record exists, since we will likely have just read the record from disk + * (i.e. we know it's there). The exception to this is when we find an + * entry in the volume index that has a different chapter. In this case, we + * need to search that chapter to determine if the volume index entry was + * for the same record or a different one. + */ + result = uds_search_volume_page_cache_for_rebuild(index->volume, + name, + record.virtual_chapter, + &update_record); + if (result != UDS_SUCCESS) + return result; + } + } else { + update_record = false; + } + + if (update_record) { + /* + * Update the volume index to reference the new chapter for the block. If the + * record had been deleted or dropped from the chapter index, it will be back. + */ + result = uds_set_volume_index_record_chapter(&record, virtual_chapter); + } else { + /* + * Add a new entry to the volume index referencing the open chapter. This should be + * done regardless of whether we are a brand new record or a sparse record, i.e. + * one that doesn't exist in the index but does on disk, since for a sparse record, + * we would want to un-sparsify if it did exist. + */ + result = uds_put_volume_index_record(&record, virtual_chapter); + } + + if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { + /* The rebuilt index will lose these records. */ + return UDS_SUCCESS; + } + + return result; +} + +static bool check_for_suspend(struct uds_index *index) +{ + bool closing; + + if (index->load_context == NULL) + return false; + + mutex_lock(&index->load_context->mutex); + if (index->load_context->status != INDEX_SUSPENDING) { + mutex_unlock(&index->load_context->mutex); + return false; + } + + /* Notify that we are suspended and wait for the resume. */ + index->load_context->status = INDEX_SUSPENDED; + uds_broadcast_cond(&index->load_context->cond); + + while ((index->load_context->status != INDEX_OPENING) && + (index->load_context->status != INDEX_FREEING)) + uds_wait_cond(&index->load_context->cond, &index->load_context->mutex); + + closing = (index->load_context->status == INDEX_FREEING); + mutex_unlock(&index->load_context->mutex); + return closing; +} + +static int replay_chapter(struct uds_index *index, u64 virtual, bool sparse) +{ + int result; + u32 i; + u32 j; + const struct index_geometry *geometry; + u32 physical_chapter; + + if (check_for_suspend(index)) { + vdo_log_info("Replay interrupted by index shutdown at chapter %llu", + (unsigned long long) virtual); + return -EBUSY; + } + + geometry = index->volume->geometry; + physical_chapter = uds_map_to_physical_chapter(geometry, virtual); + uds_prefetch_volume_chapter(index->volume, physical_chapter); + uds_set_volume_index_open_chapter(index->volume_index, virtual); + + result = rebuild_index_page_map(index, virtual); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, + "could not rebuild index page map for chapter %u", + physical_chapter); + } + + for (i = 0; i < geometry->record_pages_per_chapter; i++) { + u8 *record_page; + u32 record_page_number; + + record_page_number = geometry->index_pages_per_chapter + i; + result = uds_get_volume_record_page(index->volume, physical_chapter, + record_page_number, &record_page); + if (result != UDS_SUCCESS) { + return vdo_log_error_strerror(result, "could not get page %d", + record_page_number); + } + + for (j = 0; j < geometry->records_per_page; j++) { + const u8 *name_bytes; + struct uds_record_name name; + + name_bytes = record_page + (j * BYTES_PER_RECORD); + memcpy(&name.name, name_bytes, UDS_RECORD_NAME_SIZE); + result = replay_record(index, &name, virtual, sparse); + if (result != UDS_SUCCESS) + return result; + } + } + + return UDS_SUCCESS; +} + +static int replay_volume(struct uds_index *index) +{ + int result; + u64 old_map_update; + u64 new_map_update; + u64 virtual; + u64 from_virtual = index->oldest_virtual_chapter; + u64 upto_virtual = index->newest_virtual_chapter; + bool will_be_sparse; + + vdo_log_info("Replaying volume from chapter %llu through chapter %llu", + (unsigned long long) from_virtual, + (unsigned long long) upto_virtual); + + /* + * The index failed to load, so the volume index is empty. Add records to the volume index + * in order, skipping non-hooks in chapters which will be sparse to save time. + * + * Go through each record page of each chapter and add the records back to the volume + * index. This should not cause anything to be written to either the open chapter or the + * on-disk volume. Also skip the on-disk chapter corresponding to upto_virtual, as this + * would have already been purged from the volume index when the chapter was opened. + * + * Also, go through each index page for each chapter and rebuild the index page map. + */ + old_map_update = index->volume->index_page_map->last_update; + for (virtual = from_virtual; virtual < upto_virtual; virtual++) { + will_be_sparse = uds_is_chapter_sparse(index->volume->geometry, + from_virtual, upto_virtual, + virtual); + result = replay_chapter(index, virtual, will_be_sparse); + if (result != UDS_SUCCESS) + return result; + } + + /* Also reap the chapter being replaced by the open chapter. */ + uds_set_volume_index_open_chapter(index->volume_index, upto_virtual); + + new_map_update = index->volume->index_page_map->last_update; + if (new_map_update != old_map_update) { + vdo_log_info("replay changed index page map update from %llu to %llu", + (unsigned long long) old_map_update, + (unsigned long long) new_map_update); + } + + return UDS_SUCCESS; +} + +static int rebuild_index(struct uds_index *index) +{ + int result; + u64 lowest; + u64 highest; + bool is_empty = false; + u32 chapters_per_volume = index->volume->geometry->chapters_per_volume; + + index->volume->lookup_mode = LOOKUP_FOR_REBUILD; + result = uds_find_volume_chapter_boundaries(index->volume, &lowest, &highest, + &is_empty); + if (result != UDS_SUCCESS) { + return vdo_log_fatal_strerror(result, + "cannot rebuild index: unknown volume chapter boundaries"); + } + + if (is_empty) { + index->newest_virtual_chapter = 0; + index->oldest_virtual_chapter = 0; + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; + } + + index->newest_virtual_chapter = highest + 1; + index->oldest_virtual_chapter = lowest; + if (index->newest_virtual_chapter == + (index->oldest_virtual_chapter + chapters_per_volume)) { + /* Skip the chapter shadowed by the open chapter. */ + index->oldest_virtual_chapter++; + } + + result = replay_volume(index); + if (result != UDS_SUCCESS) + return result; + + index->volume->lookup_mode = LOOKUP_NORMAL; + return UDS_SUCCESS; +} + +static void free_index_zone(struct index_zone *zone) +{ + if (zone == NULL) + return; + + uds_free_open_chapter(zone->open_chapter); + uds_free_open_chapter(zone->writing_chapter); + vdo_free(zone); +} + +static int make_index_zone(struct uds_index *index, unsigned int zone_number) +{ + int result; + struct index_zone *zone; + + result = vdo_allocate(1, struct index_zone, "index zone", &zone); + if (result != VDO_SUCCESS) + return result; + + result = uds_make_open_chapter(index->volume->geometry, index->zone_count, + &zone->open_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + result = uds_make_open_chapter(index->volume->geometry, index->zone_count, + &zone->writing_chapter); + if (result != UDS_SUCCESS) { + free_index_zone(zone); + return result; + } + + zone->index = index; + zone->id = zone_number; + index->zones[zone_number] = zone; + + return UDS_SUCCESS; +} + +int uds_make_index(struct uds_configuration *config, enum uds_open_index_type open_type, + struct index_load_context *load_context, index_callback_fn callback, + struct uds_index **new_index) +{ + int result; + bool loaded = false; + bool new = (open_type == UDS_CREATE); + struct uds_index *index = NULL; + struct index_zone *zone; + u64 nonce; + unsigned int z; + + result = vdo_allocate_extended(struct uds_index, config->zone_count, + struct uds_request_queue *, "index", &index); + if (result != VDO_SUCCESS) + return result; + + index->zone_count = config->zone_count; + + result = uds_make_index_layout(config, new, &index->layout); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + result = vdo_allocate(index->zone_count, struct index_zone *, "zones", + &index->zones); + if (result != VDO_SUCCESS) { + uds_free_index(index); + return result; + } + + result = uds_make_volume(config, index->layout, &index->volume); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + index->volume->lookup_mode = LOOKUP_NORMAL; + for (z = 0; z < index->zone_count; z++) { + result = make_index_zone(index, z); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return vdo_log_error_strerror(result, + "Could not create index zone"); + } + } + + nonce = uds_get_volume_nonce(index->layout); + result = uds_make_volume_index(config, nonce, &index->volume_index); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return vdo_log_error_strerror(result, "could not make volume index"); + } + + index->load_context = load_context; + index->callback = callback; + + result = initialize_index_queues(index, config->geometry); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + result = make_chapter_writer(index, &index->chapter_writer); + if (result != UDS_SUCCESS) { + uds_free_index(index); + return result; + } + + if (!new) { + result = load_index(index); + switch (result) { + case UDS_SUCCESS: + loaded = true; + break; + case -ENOMEM: + /* We should not try a rebuild for this error. */ + vdo_log_error_strerror(result, "index could not be loaded"); + break; + default: + vdo_log_error_strerror(result, "index could not be loaded"); + if (open_type == UDS_LOAD) { + result = rebuild_index(index); + if (result != UDS_SUCCESS) { + vdo_log_error_strerror(result, + "index could not be rebuilt"); + } + } + break; + } + } + + if (result != UDS_SUCCESS) { + uds_free_index(index); + return vdo_log_error_strerror(result, "fatal error in %s()", __func__); + } + + for (z = 0; z < index->zone_count; z++) { + zone = index->zones[z]; + zone->oldest_virtual_chapter = index->oldest_virtual_chapter; + zone->newest_virtual_chapter = index->newest_virtual_chapter; + } + + if (index->load_context != NULL) { + mutex_lock(&index->load_context->mutex); + index->load_context->status = INDEX_READY; + /* + * If we get here, suspend is meaningless, but notify any thread trying to suspend + * us so it doesn't hang. + */ + uds_broadcast_cond(&index->load_context->cond); + mutex_unlock(&index->load_context->mutex); + } + + index->has_saved_open_chapter = loaded; + index->need_to_save = !loaded; + *new_index = index; + return UDS_SUCCESS; +} + +void uds_free_index(struct uds_index *index) +{ + unsigned int i; + + if (index == NULL) + return; + + uds_request_queue_finish(index->triage_queue); + for (i = 0; i < index->zone_count; i++) + uds_request_queue_finish(index->zone_queues[i]); + + free_chapter_writer(index->chapter_writer); + + uds_free_volume_index(index->volume_index); + if (index->zones != NULL) { + for (i = 0; i < index->zone_count; i++) + free_index_zone(index->zones[i]); + vdo_free(index->zones); + } + + uds_free_volume(index->volume); + uds_free_index_layout(vdo_forget(index->layout)); + vdo_free(index); +} + +/* Wait for the chapter writer to complete any outstanding writes. */ +void uds_wait_for_idle_index(struct uds_index *index) +{ + struct chapter_writer *writer = index->chapter_writer; + + mutex_lock(&writer->mutex); + while (writer->zones_to_write > 0) + uds_wait_cond(&writer->cond, &writer->mutex); + mutex_unlock(&writer->mutex); +} + +/* This function assumes that all requests have been drained. */ +int uds_save_index(struct uds_index *index) +{ + int result; + + if (!index->need_to_save) + return UDS_SUCCESS; + + uds_wait_for_idle_index(index); + index->prev_save = index->last_save; + index->last_save = ((index->newest_virtual_chapter == 0) ? + NO_LAST_SAVE : index->newest_virtual_chapter - 1); + vdo_log_info("beginning save (vcn %llu)", (unsigned long long) index->last_save); + + result = uds_save_index_state(index->layout, index); + if (result != UDS_SUCCESS) { + vdo_log_info("save index failed"); + index->last_save = index->prev_save; + } else { + index->has_saved_open_chapter = true; + index->need_to_save = false; + vdo_log_info("finished save (vcn %llu)", + (unsigned long long) index->last_save); + } + + return result; +} + +int uds_replace_index_storage(struct uds_index *index, struct block_device *bdev) +{ + return uds_replace_volume_storage(index->volume, index->layout, bdev); +} + +/* Accessing statistics should be safe from any thread. */ +void uds_get_index_stats(struct uds_index *index, struct uds_index_stats *counters) +{ + struct volume_index_stats stats; + + uds_get_volume_index_stats(index->volume_index, &stats); + counters->entries_indexed = stats.record_count; + counters->collisions = stats.collision_count; + counters->entries_discarded = stats.discard_count; + + counters->memory_used = (index->volume_index->memory_size + + index->volume->cache_size + + index->chapter_writer->memory_size); +} + +void uds_enqueue_request(struct uds_request *request, enum request_stage stage) +{ + struct uds_index *index = request->index; + struct uds_request_queue *queue; + + switch (stage) { + case STAGE_TRIAGE: + if (index->triage_queue != NULL) { + queue = index->triage_queue; + break; + } + + fallthrough; + + case STAGE_INDEX: + request->zone_number = + uds_get_volume_index_zone(index->volume_index, &request->record_name); + fallthrough; + + case STAGE_MESSAGE: + queue = index->zone_queues[request->zone_number]; + break; + + default: + VDO_ASSERT_LOG_ONLY(false, "invalid index stage: %d", stage); + return; + } + + uds_request_queue_enqueue(queue, request); +} |