diff options
Diffstat (limited to 'storage/tokudb/PerconaFT/ft/serialize/block_table.cc')
-rw-r--r-- | storage/tokudb/PerconaFT/ft/serialize/block_table.cc | 1157 |
1 files changed, 1157 insertions, 0 deletions
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.cc b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc new file mode 100644 index 00000000..e3606c11 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc @@ -0,0 +1,1157 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include <my_global.h> +#include "portability/memory.h" +#include "portability/toku_assert.h" +#include "portability/toku_portability.h" +#include "portability/toku_pthread.h" + +// ugly but pragmatic, need access to dirty bits while holding translation lock +// TODO: Refactor this (possibly with FT-301) +#include "ft/ft-internal.h" + +// TODO: reorganize this dependency (FT-303) +#include "ft/ft-ops.h" // for toku_maybe_truncate_file +#include "ft/serialize/block_table.h" +#include "ft/serialize/rbuf.h" +#include "ft/serialize/wbuf.h" +#include "ft/serialize/block_allocator.h" +#include "util/nb_mutex.h" +#include "util/scoped_malloc.h" + + +toku_instr_key *block_table_mutex_key; +toku_instr_key *safe_file_size_lock_mutex_key; +toku_instr_key *safe_file_size_lock_rwlock_key; + +// indicates the end of a freelist +static const BLOCKNUM freelist_null = {-1}; + +// value of block_translation_pair.size if blocknum is unused +static const DISKOFF size_is_free = (DISKOFF)-1; + +// value of block_translation_pair.u.diskoff if blocknum is used but does not +// yet have a diskblock +static const DISKOFF diskoff_unused = (DISKOFF)-2; + +void block_table::_mutex_lock() { toku_mutex_lock(&_mutex); } + +void block_table::_mutex_unlock() { toku_mutex_unlock(&_mutex); } + +// TODO: Move lock to FT +void toku_ft_lock(FT ft) { + block_table *bt = &ft->blocktable; + bt->_mutex_lock(); +} + +// TODO: Move lock to FT +void toku_ft_unlock(FT ft) { + block_table *bt = &ft->blocktable; + toku_mutex_assert_locked(&bt->_mutex); + bt->_mutex_unlock(); +} + +// There are two headers: the reserve must fit them both and be suitably +// aligned. +static_assert(BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE % + BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT == + 0, + "Block allocator's header reserve must be suitibly aligned"); +static_assert( + BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 == + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + "Block allocator's total header reserve must exactly fit two headers"); + +// does NOT initialize the block allocator: the caller is responsible +void block_table::_create_internal() { + memset(&_current, 0, sizeof(struct translation)); + memset(&_inprogress, 0, sizeof(struct translation)); + memset(&_checkpointed, 0, sizeof(struct translation)); + memset(&_mutex, 0, sizeof(_mutex)); + _bt_block_allocator = new BlockAllocator(); + toku_mutex_init(*block_table_mutex_key, &_mutex, nullptr); + nb_mutex_init(*safe_file_size_lock_mutex_key, + *safe_file_size_lock_rwlock_key, + &_safe_file_size_lock); +} + +// Fill in the checkpointed translation from buffer, and copy checkpointed to +// current. +// The one read from disk is the last known checkpointed one, so we are keeping +// it in +// place and then setting current (which is never stored on disk) for current +// use. +// The translation_buffer has translation only, we create the rest of the +// block_table. +int block_table::create_from_buffer( + int fd, + DISKOFF location_on_disk, // Location of translation_buffer + DISKOFF size_on_disk, + unsigned char *translation_buffer) { + // Does not initialize the block allocator + _create_internal(); + + // Deserialize the translation and copy it to current + int r = _translation_deserialize_from_buffer( + &_checkpointed, location_on_disk, size_on_disk, translation_buffer); + if (r != 0) { + return r; + } + _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT); + + // Determine the file size + int64_t file_size = 0; + r = toku_os_get_file_size(fd, &file_size); + lazy_assert_zero(r); + invariant(file_size >= 0); + _safe_file_size = file_size; + + // Gather the non-empty translations and use them to create the block + // allocator + toku::scoped_malloc pairs_buf(_checkpointed.smallest_never_used_blocknum.b * + sizeof(struct BlockAllocator::BlockPair)); + struct BlockAllocator::BlockPair *CAST_FROM_VOIDP(pairs, pairs_buf.get()); + uint64_t n_pairs = 0; + for (int64_t i = 0; i < _checkpointed.smallest_never_used_blocknum.b; i++) { + struct block_translation_pair pair = _checkpointed.block_translation[i]; + if (pair.size > 0) { + invariant(pair.u.diskoff != diskoff_unused); + pairs[n_pairs++] = + BlockAllocator::BlockPair(pair.u.diskoff, pair.size); + } + } + + _bt_block_allocator->CreateFromBlockPairs( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT, + pairs, + n_pairs); + + return 0; +} + +void block_table::create() { + // Does not initialize the block allocator + _create_internal(); + + _checkpointed.type = TRANSLATION_CHECKPOINTED; + _checkpointed.smallest_never_used_blocknum = + make_blocknum(RESERVED_BLOCKNUMS); + _checkpointed.length_of_array = + _checkpointed.smallest_never_used_blocknum.b; + _checkpointed.blocknum_freelist_head = freelist_null; + XMALLOC_N(_checkpointed.length_of_array, _checkpointed.block_translation); + for (int64_t i = 0; i < _checkpointed.length_of_array; i++) { + _checkpointed.block_translation[i].size = 0; + _checkpointed.block_translation[i].u.diskoff = diskoff_unused; + } + + // we just created a default checkpointed, now copy it to current. + _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT); + + // Create an empty block allocator. + _bt_block_allocator->Create( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT); +} + +// TODO: Refactor with FT-303 +static void ft_set_dirty(FT ft, bool for_checkpoint) { + invariant(ft->h->type == FT_CURRENT); + if (for_checkpoint) { + invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS); + ft->checkpoint_header->set_dirty(); + } else { + ft->h->set_dirty(); + } +} + +void block_table::_maybe_truncate_file(int fd, uint64_t size_needed_before) { + toku_mutex_assert_locked(&_mutex); + uint64_t new_size_needed = _bt_block_allocator->AllocatedLimit(); + // Save a call to toku_os_get_file_size (kernel call) if unlikely to be + // useful. + if (new_size_needed < size_needed_before && + new_size_needed < _safe_file_size) { + nb_mutex_lock(&_safe_file_size_lock, &_mutex); + + // Must hold _safe_file_size_lock to change _safe_file_size. + if (new_size_needed < _safe_file_size) { + int64_t safe_file_size_before = _safe_file_size; + // Not safe to use the 'to-be-truncated' portion until truncate is + // done. + _safe_file_size = new_size_needed; + _mutex_unlock(); + + uint64_t size_after; + toku_maybe_truncate_file( + fd, new_size_needed, safe_file_size_before, &size_after); + _mutex_lock(); + + _safe_file_size = size_after; + } + nb_mutex_unlock(&_safe_file_size_lock); + } +} + +void block_table::maybe_truncate_file_on_open(int fd) { + _mutex_lock(); + _maybe_truncate_file(fd, _safe_file_size); + _mutex_unlock(); +} + +void block_table::_copy_translation(struct translation *dst, + struct translation *src, + enum translation_type newtype) { + // We intend to malloc a fresh block, so the incoming translation should be + // empty + invariant_null(dst->block_translation); + + invariant(src->length_of_array >= src->smallest_never_used_blocknum.b); + invariant(newtype == TRANSLATION_DEBUG || + (src->type == TRANSLATION_CURRENT && + newtype == TRANSLATION_INPROGRESS) || + (src->type == TRANSLATION_CHECKPOINTED && + newtype == TRANSLATION_CURRENT)); + dst->type = newtype; + dst->smallest_never_used_blocknum = src->smallest_never_used_blocknum; + dst->blocknum_freelist_head = src->blocknum_freelist_head; + + // destination btt is of fixed size. Allocate + memcpy the exact length + // necessary. + dst->length_of_array = dst->smallest_never_used_blocknum.b; + XMALLOC_N(dst->length_of_array, dst->block_translation); + memcpy(dst->block_translation, + src->block_translation, + dst->length_of_array * sizeof(*dst->block_translation)); + + // New version of btt is not yet stored on disk. + dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size = 0; + dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff = + diskoff_unused; +} + +int64_t block_table::get_blocks_in_use_unlocked() { + BLOCKNUM b; + struct translation *t = &_current; + int64_t num_blocks = 0; + { + // Reserved blocknums do not get upgraded; They are part of the header. + for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; + b.b++) { + if (t->block_translation[b.b].size != size_is_free) { + num_blocks++; + } + } + } + return num_blocks; +} + +void block_table::_maybe_optimize_translation(struct translation *t) { + // Reduce 'smallest_never_used_blocknum.b' (completely free blocknums + // instead of just + // on a free list. Doing so requires us to regenerate the free list. + // This is O(n) work, so do it only if you're already doing that. + + BLOCKNUM b; + paranoid_invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS); + // Calculate how large the free suffix is. + int64_t freed; + { + for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS; + b.b--) { + if (t->block_translation[b.b - 1].size != size_is_free) { + break; + } + } + freed = t->smallest_never_used_blocknum.b - b.b; + } + if (freed > 0) { + t->smallest_never_used_blocknum.b = b.b; + if (t->length_of_array / 4 > t->smallest_never_used_blocknum.b) { + // We're using more memory than necessary to represent this now. + // Reduce. + uint64_t new_length = t->smallest_never_used_blocknum.b * 2; + XREALLOC_N(new_length, t->block_translation); + t->length_of_array = new_length; + // No need to zero anything out. + } + + // Regenerate free list. + t->blocknum_freelist_head.b = freelist_null.b; + for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; + b.b++) { + if (t->block_translation[b.b].size == size_is_free) { + t->block_translation[b.b].u.next_free_blocknum = + t->blocknum_freelist_head; + t->blocknum_freelist_head = b; + } + } + } +} + +// block table must be locked by caller of this function +void block_table::note_start_checkpoint_unlocked() { + toku_mutex_assert_locked(&_mutex); + + // We're going to do O(n) work to copy the translation, so we + // can afford to do O(n) work by optimizing the translation + _maybe_optimize_translation(&_current); + + // Copy current translation to inprogress translation. + _copy_translation(&_inprogress, &_current, TRANSLATION_INPROGRESS); + + _checkpoint_skipped = false; +} + +void block_table::note_skipped_checkpoint() { + // Purpose, alert block translation that the checkpoint was skipped, e.x. + // for a non-dirty header + _mutex_lock(); + paranoid_invariant_notnull(_inprogress.block_translation); + _checkpoint_skipped = true; + _mutex_unlock(); +} + +// Purpose: free any disk space used by previous checkpoint that isn't in use by +// either +// - current state +// - in-progress checkpoint +// capture inprogress as new checkpointed. +// For each entry in checkpointBTT +// if offset does not match offset in inprogress +// assert offset does not match offset in current +// free (offset,len) from checkpoint +// move inprogress to checkpoint (resetting type) +// inprogress = NULL +void block_table::note_end_checkpoint(int fd) { + // Free unused blocks + _mutex_lock(); + uint64_t allocated_limit_at_start = _bt_block_allocator->AllocatedLimit(); + paranoid_invariant_notnull(_inprogress.block_translation); + if (_checkpoint_skipped) { + toku_free(_inprogress.block_translation); + memset(&_inprogress, 0, sizeof(_inprogress)); + goto end; + } + + // Make certain inprogress was allocated space on disk + invariant( + _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0); + invariant( + _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff > + 0); + + { + struct translation *t = &_checkpointed; + for (int64_t i = 0; i < t->length_of_array; i++) { + struct block_translation_pair *pair = &t->block_translation[i]; + if (pair->size > 0 && + !_translation_prevents_freeing( + &_inprogress, make_blocknum(i), pair)) { + invariant(!_translation_prevents_freeing( + &_current, make_blocknum(i), pair)); + _bt_block_allocator->FreeBlock(pair->u.diskoff, pair->size); + } + } + toku_free(_checkpointed.block_translation); + _checkpointed = _inprogress; + _checkpointed.type = TRANSLATION_CHECKPOINTED; + memset(&_inprogress, 0, sizeof(_inprogress)); + _maybe_truncate_file(fd, allocated_limit_at_start); + } +end: + _mutex_unlock(); +} + +bool block_table::_is_valid_blocknum(struct translation *t, BLOCKNUM b) { + invariant(t->length_of_array >= t->smallest_never_used_blocknum.b); + return b.b >= 0 && b.b < t->smallest_never_used_blocknum.b; +} + +void block_table::_verify_valid_blocknum(struct translation *UU(t), + BLOCKNUM UU(b)) { + invariant(_is_valid_blocknum(t, b)); +} + +bool block_table::_is_valid_freeable_blocknum(struct translation *t, + BLOCKNUM b) { + invariant(t->length_of_array >= t->smallest_never_used_blocknum.b); + return b.b >= RESERVED_BLOCKNUMS && b.b < t->smallest_never_used_blocknum.b; +} + +// should be freeable +void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t), + BLOCKNUM UU(b)) { + invariant(_is_valid_freeable_blocknum(t, b)); +} + +// Also used only in ft-serialize-test. +void block_table::block_free(uint64_t offset, uint64_t size) { + _mutex_lock(); + _bt_block_allocator->FreeBlock(offset, size); + _mutex_unlock(); +} + +int64_t block_table::_calculate_size_on_disk(struct translation *t) { + return 8 + // smallest_never_used_blocknum + 8 + // blocknum_freelist_head + t->smallest_never_used_blocknum.b * 16 + // Array + 4; // 4 for checksum +} + +// We cannot free the disk space allocated to this blocknum if it is still in +// use by the given translation table. +bool block_table::_translation_prevents_freeing( + struct translation *t, + BLOCKNUM b, + struct block_translation_pair *old_pair) { + return t->block_translation && b.b < t->smallest_never_used_blocknum.b && + old_pair->u.diskoff == t->block_translation[b.b].u.diskoff; +} + +void block_table::_realloc_on_disk_internal(BLOCKNUM b, + DISKOFF size, + DISKOFF *offset, + FT ft, + bool for_checkpoint) { + toku_mutex_assert_locked(&_mutex); + ft_set_dirty(ft, for_checkpoint); + + struct translation *t = &_current; + struct block_translation_pair old_pair = t->block_translation[b.b]; + // Free the old block if it is not still in use by the checkpoint in + // progress or the previous checkpoint + bool cannot_free = + (!for_checkpoint && + _translation_prevents_freeing(&_inprogress, b, &old_pair)) || + _translation_prevents_freeing(&_checkpointed, b, &old_pair); + if (!cannot_free && old_pair.u.diskoff != diskoff_unused) { + _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size); + } + + uint64_t allocator_offset = diskoff_unused; + t->block_translation[b.b].size = size; + if (size > 0) { + // Allocate a new block if the size is greater than 0, + // if the size is just 0, offset will be set to diskoff_unused + _bt_block_allocator->AllocBlock(size, &allocator_offset); + } + t->block_translation[b.b].u.diskoff = allocator_offset; + *offset = allocator_offset; + + // Update inprogress btt if appropriate (if called because Pending bit is + // set). + if (for_checkpoint) { + paranoid_invariant(b.b < _inprogress.length_of_array); + _inprogress.block_translation[b.b] = t->block_translation[b.b]; + } +} + +void block_table::_ensure_safe_write_unlocked(int fd, + DISKOFF block_size, + DISKOFF block_offset) { + // Requires: holding _mutex + uint64_t size_needed = block_size + block_offset; + if (size_needed > _safe_file_size) { + // Must hold _safe_file_size_lock to change _safe_file_size. + nb_mutex_lock(&_safe_file_size_lock, &_mutex); + if (size_needed > _safe_file_size) { + _mutex_unlock(); + + int64_t size_after; + toku_maybe_preallocate_in_file( + fd, size_needed, _safe_file_size, &size_after); + + _mutex_lock(); + _safe_file_size = size_after; + } + nb_mutex_unlock(&_safe_file_size_lock); + } +} + +void block_table::realloc_on_disk(BLOCKNUM b, + DISKOFF size, + DISKOFF *offset, + FT ft, + int fd, + bool for_checkpoint) { + _mutex_lock(); + struct translation *t = &_current; + _verify_valid_freeable_blocknum(t, b); + _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint); + + _ensure_safe_write_unlocked(fd, size, *offset); + _mutex_unlock(); +} + +bool block_table::_pair_is_unallocated(struct block_translation_pair *pair) { + return pair->size == 0 && pair->u.diskoff == diskoff_unused; +} + +// Effect: figure out where to put the inprogress btt on disk, allocate space +// for it there. +// The space must be 512-byte aligned (both the starting address and the +// size). +// As a result, the allcoated space may be a little bit bigger (up to the next +// 512-byte boundary) than the actual btt. +void block_table::_alloc_inprogress_translation_on_disk_unlocked() { + toku_mutex_assert_locked(&_mutex); + + struct translation *t = &_inprogress; + paranoid_invariant_notnull(t->block_translation); + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); + // Each inprogress is allocated only once + paranoid_invariant(_pair_is_unallocated(&t->block_translation[b.b])); + + // Allocate a new block + int64_t size = _calculate_size_on_disk(t); + uint64_t offset; + _bt_block_allocator->AllocBlock(size, &offset); + t->block_translation[b.b].u.diskoff = offset; + t->block_translation[b.b].size = size; +} + +// Effect: Serializes the blocktable to a wbuf (which starts uninitialized) +// A clean shutdown runs checkpoint start so that current and inprogress are +// copies. +// The resulting wbuf buffer is guaranteed to be be 512-byte aligned and the +// total length is a multiple of 512 (so we pad with zeros at the end if +// needd) +// The address is guaranteed to be 512-byte aligned, but the size is not +// guaranteed. +// It *is* guaranteed that we can read up to the next 512-byte boundary, +// however +void block_table::serialize_translation_to_wbuf(int fd, + struct wbuf *w, + int64_t *address, + int64_t *size) { + _mutex_lock(); + struct translation *t = &_inprogress; + + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); + _alloc_inprogress_translation_on_disk_unlocked(); // The allocated block + // must be 512-byte + // aligned to make + // O_DIRECT happy. + uint64_t size_translation = _calculate_size_on_disk(t); + uint64_t size_aligned = roundup_to_multiple(512, size_translation); + invariant((int64_t)size_translation == t->block_translation[b.b].size); + { + // Init wbuf + if (0) + printf( + "%s:%d writing translation table of size_translation %" PRIu64 + " at %" PRId64 "\n", + __FILE__, + __LINE__, + size_translation, + t->block_translation[b.b].u.diskoff); + char *XMALLOC_N_ALIGNED(512, size_aligned, buf); + for (uint64_t i = size_translation; i < size_aligned; i++) + buf[i] = 0; // fill in the end of the buffer with zeros. + wbuf_init(w, buf, size_aligned); + } + wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum); + wbuf_BLOCKNUM(w, t->blocknum_freelist_head); + int64_t i; + for (i = 0; i < t->smallest_never_used_blocknum.b; i++) { + if (0) + printf("%s:%d %" PRId64 ",%" PRId64 "\n", + __FILE__, + __LINE__, + t->block_translation[i].u.diskoff, + t->block_translation[i].size); + wbuf_DISKOFF(w, t->block_translation[i].u.diskoff); + wbuf_DISKOFF(w, t->block_translation[i].size); + } + uint32_t checksum = toku_x1764_finish(&w->checksum); + wbuf_int(w, checksum); + *address = t->block_translation[b.b].u.diskoff; + *size = size_translation; + invariant((*address) % 512 == 0); + + _ensure_safe_write_unlocked(fd, size_aligned, *address); + _mutex_unlock(); +} + +// Perhaps rename: purpose is get disk address of a block, given its blocknum +// (blockid?) +void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, + DISKOFF *offset, + DISKOFF *size) { + struct translation *t = &_current; + _verify_valid_blocknum(t, b); + if (offset) { + *offset = t->block_translation[b.b].u.diskoff; + } + if (size) { + *size = t->block_translation[b.b].size; + } +} + +// Perhaps rename: purpose is get disk address of a block, given its blocknum +// (blockid?) +void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, + DISKOFF *offset, + DISKOFF *size) { + _mutex_lock(); + _translate_blocknum_to_offset_size_unlocked(b, offset, size); + _mutex_unlock(); +} + +// Only called by toku_allocate_blocknum +// Effect: expand the array to maintain size invariant +// given that one more never-used blocknum will soon be used. +void block_table::_maybe_expand_translation(struct translation *t) { + if (t->length_of_array <= t->smallest_never_used_blocknum.b) { + // expansion is necessary + uint64_t new_length = t->smallest_never_used_blocknum.b * 2; + XREALLOC_N(new_length, t->block_translation); + uint64_t i; + for (i = t->length_of_array; i < new_length; i++) { + t->block_translation[i].u.next_free_blocknum = freelist_null; + t->block_translation[i].size = size_is_free; + } + t->length_of_array = new_length; + } +} + +void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) { + toku_mutex_assert_locked(&_mutex); + BLOCKNUM result; + struct translation *t = &_current; + if (t->blocknum_freelist_head.b == freelist_null.b) { + // no previously used blocknums are available + // use a never used blocknum + _maybe_expand_translation( + t); // Ensure a never used blocknums is available + result = t->smallest_never_used_blocknum; + t->smallest_never_used_blocknum.b++; + } else { // reuse a previously used blocknum + result = t->blocknum_freelist_head; + BLOCKNUM next = t->block_translation[result.b].u.next_free_blocknum; + t->blocknum_freelist_head = next; + } + // Verify the blocknum is free + paranoid_invariant(t->block_translation[result.b].size == size_is_free); + // blocknum is not free anymore + t->block_translation[result.b].u.diskoff = diskoff_unused; + t->block_translation[result.b].size = 0; + _verify_valid_freeable_blocknum(t, result); + *res = result; + ft_set_dirty(ft, false); +} + +void block_table::allocate_blocknum(BLOCKNUM *res, FT ft) { + _mutex_lock(); + _allocate_blocknum_unlocked(res, ft); + _mutex_unlock(); +} + +void block_table::_free_blocknum_in_translation(struct translation *t, + BLOCKNUM b) { + _verify_valid_freeable_blocknum(t, b); + paranoid_invariant(t->block_translation[b.b].size != size_is_free); + + t->block_translation[b.b].size = size_is_free; + t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head; + t->blocknum_freelist_head = b; +} + +// Effect: Free a blocknum. +// If the blocknum holds the only reference to a block on disk, free that block +void block_table::_free_blocknum_unlocked(BLOCKNUM *bp, + FT ft, + bool for_checkpoint) { + toku_mutex_assert_locked(&_mutex); + BLOCKNUM b = *bp; + bp->b = 0; // Remove caller's reference. + + struct block_translation_pair old_pair = _current.block_translation[b.b]; + + _free_blocknum_in_translation(&_current, b); + if (for_checkpoint) { + paranoid_invariant(ft->checkpoint_header->type == + FT_CHECKPOINT_INPROGRESS); + _free_blocknum_in_translation(&_inprogress, b); + } + + // If the size is 0, no disk block has ever been assigned to this blocknum. + if (old_pair.size > 0) { + // Free the old block if it is not still in use by the checkpoint in + // progress or the previous checkpoint + bool cannot_free = + _translation_prevents_freeing(&_inprogress, b, &old_pair) || + _translation_prevents_freeing(&_checkpointed, b, &old_pair); + if (!cannot_free) { + _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size); + } + } else { + paranoid_invariant(old_pair.size == 0); + paranoid_invariant(old_pair.u.diskoff == diskoff_unused); + } + ft_set_dirty(ft, for_checkpoint); +} + +void block_table::free_blocknum(BLOCKNUM *bp, FT ft, bool for_checkpoint) { + _mutex_lock(); + _free_blocknum_unlocked(bp, ft, for_checkpoint); + _mutex_unlock(); +} + +// Verify there are no free blocks. +void block_table::verify_no_free_blocknums() { + invariant(_current.blocknum_freelist_head.b == freelist_null.b); +} + +// Frees blocknums that have a size of 0 and unused diskoff +// Currently used for eliminating unused cached rollback log nodes +void block_table::free_unused_blocknums(BLOCKNUM root) { + _mutex_lock(); + int64_t smallest = _current.smallest_never_used_blocknum.b; + for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) { + if (i == root.b) { + continue; + } + BLOCKNUM b = make_blocknum(i); + if (_current.block_translation[b.b].size == 0) { + invariant(_current.block_translation[b.b].u.diskoff == + diskoff_unused); + _free_blocknum_in_translation(&_current, b); + } + } + _mutex_unlock(); +} + +bool block_table::_no_data_blocks_except_root(BLOCKNUM root) { + bool ok = true; + _mutex_lock(); + int64_t smallest = _current.smallest_never_used_blocknum.b; + if (root.b < RESERVED_BLOCKNUMS) { + ok = false; + goto cleanup; + } + for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) { + if (i == root.b) { + continue; + } + BLOCKNUM b = make_blocknum(i); + if (_current.block_translation[b.b].size != size_is_free) { + ok = false; + goto cleanup; + } + } +cleanup: + _mutex_unlock(); + return ok; +} + +// Verify there are no data blocks except root. +// TODO(leif): This actually takes a lock, but I don't want to fix all the +// callers right now. +void block_table::verify_no_data_blocks_except_root(BLOCKNUM UU(root)) { + paranoid_invariant(_no_data_blocks_except_root(root)); +} + +bool block_table::_blocknum_allocated(BLOCKNUM b) { + _mutex_lock(); + struct translation *t = &_current; + _verify_valid_blocknum(t, b); + bool ok = t->block_translation[b.b].size != size_is_free; + _mutex_unlock(); + return ok; +} + +// Verify a blocknum is currently allocated. +void block_table::verify_blocknum_allocated(BLOCKNUM UU(b)) { + paranoid_invariant(_blocknum_allocated(b)); +} + +// Only used by toku_dump_translation table (debug info) +void block_table::_dump_translation_internal(FILE *f, struct translation *t) { + if (t->block_translation) { + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); + fprintf(f, " length_of_array[%" PRId64 "]", t->length_of_array); + fprintf(f, + " smallest_never_used_blocknum[%" PRId64 "]", + t->smallest_never_used_blocknum.b); + fprintf(f, + " blocknum_free_list_head[%" PRId64 "]", + t->blocknum_freelist_head.b); + fprintf( + f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size); + fprintf(f, + " location_on_disk[%" PRId64 "]\n", + t->block_translation[b.b].u.diskoff); + int64_t i; + for (i = 0; i < t->length_of_array; i++) { + fprintf(f, + " %" PRId64 ": %" PRId64 " %" PRId64 "\n", + i, + t->block_translation[i].u.diskoff, + t->block_translation[i].size); + } + fprintf(f, "\n"); + } else { + fprintf(f, " does not exist\n"); + } +} + +// Only used by toku_ft_dump which is only for debugging purposes +// "pretty" just means we use tabs so we can parse output easier later +void block_table::dump_translation_table_pretty(FILE *f) { + _mutex_lock(); + struct translation *t = &_checkpointed; + invariant(t->block_translation != nullptr); + for (int64_t i = 0; i < t->length_of_array; ++i) { + fprintf(f, + "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", + i, + t->block_translation[i].u.diskoff, + t->block_translation[i].size); + } + _mutex_unlock(); +} + +// Only used by toku_ft_dump which is only for debugging purposes +void block_table::dump_translation_table(FILE *f) { + _mutex_lock(); + fprintf(f, "Current block translation:"); + _dump_translation_internal(f, &_current); + fprintf(f, "Checkpoint in progress block translation:"); + _dump_translation_internal(f, &_inprogress); + fprintf(f, "Checkpointed block translation:"); + _dump_translation_internal(f, &_checkpointed); + _mutex_unlock(); +} + +// Only used by ftdump +void block_table::blocknum_dump_translation(BLOCKNUM b) { + _mutex_lock(); + + struct translation *t = &_current; + if (b.b < t->length_of_array) { + struct block_translation_pair *bx = &t->block_translation[b.b]; + printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n", + b.b, + bx->u.diskoff, + bx->size); + } + _mutex_unlock(); +} + +// Must not call this function when anything else is using the blocktable. +// No one may use the blocktable afterwards. +void block_table::destroy(void) { + // TODO: translation.destroy(); + toku_free(_current.block_translation); + toku_free(_inprogress.block_translation); + toku_free(_checkpointed.block_translation); + + _bt_block_allocator->Destroy(); + delete _bt_block_allocator; + toku_mutex_destroy(&_mutex); + nb_mutex_destroy(&_safe_file_size_lock); +} + +int block_table::_translation_deserialize_from_buffer( + struct translation *t, + DISKOFF location_on_disk, + uint64_t size_on_disk, + // out: buffer with serialized translation + unsigned char *translation_buffer) { + int r = 0; + invariant(location_on_disk != 0); + t->type = TRANSLATION_CHECKPOINTED; + + // check the checksum + uint32_t x1764 = toku_x1764_memory(translation_buffer, size_on_disk - 4); + uint64_t offset = size_on_disk - 4; + uint32_t stored_x1764 = toku_dtoh32(*(int *)(translation_buffer + offset)); + if (x1764 != stored_x1764) { + fprintf(stderr, + "Translation table checksum failure: calc=0x%08x read=0x%08x\n", + x1764, + stored_x1764); + r = TOKUDB_BAD_CHECKSUM; + goto exit; + } + + struct rbuf rb; + rb.buf = translation_buffer; + rb.ndone = 0; + rb.size = size_on_disk - 4; // 4==checksum + + t->smallest_never_used_blocknum = rbuf_blocknum(&rb); + t->length_of_array = t->smallest_never_used_blocknum.b; + invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS); + t->blocknum_freelist_head = rbuf_blocknum(&rb); + XMALLOC_N(t->length_of_array, t->block_translation); + for (int64_t i = 0; i < t->length_of_array; i++) { + t->block_translation[i].u.diskoff = rbuf_DISKOFF(&rb); + t->block_translation[i].size = rbuf_DISKOFF(&rb); + } + invariant(_calculate_size_on_disk(t) == (int64_t)size_on_disk); + invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == + (int64_t)size_on_disk); + invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == + location_on_disk); + +exit: + return r; +} + +int block_table::iterate(enum translation_type type, + BLOCKTABLE_CALLBACK f, + void *extra, + bool data_only, + bool used_only) { + struct translation *src; + + int r = 0; + switch (type) { + case TRANSLATION_CURRENT: + src = &_current; + break; + case TRANSLATION_INPROGRESS: + src = &_inprogress; + break; + case TRANSLATION_CHECKPOINTED: + src = &_checkpointed; + break; + default: + r = EINVAL; + } + + struct translation fakecurrent; + memset(&fakecurrent, 0, sizeof(struct translation)); + + struct translation *t = &fakecurrent; + if (r == 0) { + _mutex_lock(); + _copy_translation(t, src, TRANSLATION_DEBUG); + t->block_translation[RESERVED_BLOCKNUM_TRANSLATION] = + src->block_translation[RESERVED_BLOCKNUM_TRANSLATION]; + _mutex_unlock(); + int64_t i; + for (i = 0; i < t->smallest_never_used_blocknum.b; i++) { + struct block_translation_pair pair = t->block_translation[i]; + if (data_only && i < RESERVED_BLOCKNUMS) + continue; + if (used_only && pair.size <= 0) + continue; + r = f(make_blocknum(i), pair.size, pair.u.diskoff, extra); + if (r != 0) + break; + } + toku_free(t->block_translation); + } + return r; +} + +typedef struct { + int64_t used_space; + int64_t total_space; +} frag_extra; + +static int frag_helper(BLOCKNUM UU(b), + int64_t size, + int64_t address, + void *extra) { + frag_extra *info = (frag_extra *)extra; + + if (size + address > info->total_space) + info->total_space = size + address; + info->used_space += size; + return 0; +} + +void block_table::internal_fragmentation(int64_t *total_sizep, + int64_t *used_sizep) { + frag_extra info = {0, 0}; + int r = iterate(TRANSLATION_CHECKPOINTED, frag_helper, &info, false, true); + invariant_zero(r); + + if (total_sizep) + *total_sizep = info.total_space; + if (used_sizep) + *used_sizep = info.used_space; +} + +void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size, + DISKOFF *offset, + FT ft) { + toku_mutex_assert_locked(&_mutex); + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR); + _realloc_on_disk_internal(b, size, offset, ft, false); +} + +void block_table::realloc_descriptor_on_disk(DISKOFF size, + DISKOFF *offset, + FT ft, + int fd) { + _mutex_lock(); + _realloc_descriptor_on_disk_unlocked(size, offset, ft); + _ensure_safe_write_unlocked(fd, size, *offset); + _mutex_unlock(); +} + +void block_table::get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size) { + _mutex_lock(); + BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR); + _translate_blocknum_to_offset_size_unlocked(b, offset, size); + _mutex_unlock(); +} + +void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) { + // Requires: blocktable lock is held. + // Requires: report->file_size_bytes is already filled in. + + // Count the headers. + report->data_bytes = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + report->data_blocks = 1; + report->checkpoint_bytes_additional = + BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + report->checkpoint_blocks_additional = 1; + + struct translation *current = &_current; + for (int64_t i = 0; i < current->length_of_array; i++) { + struct block_translation_pair *pair = ¤t->block_translation[i]; + if (pair->size > 0) { + report->data_bytes += pair->size; + report->data_blocks++; + } + } + + struct translation *checkpointed = &_checkpointed; + for (int64_t i = 0; i < checkpointed->length_of_array; i++) { + struct block_translation_pair *pair = + &checkpointed->block_translation[i]; + if (pair->size > 0 && + !(i < current->length_of_array && + current->block_translation[i].size > 0 && + current->block_translation[i].u.diskoff == pair->u.diskoff)) { + report->checkpoint_bytes_additional += pair->size; + report->checkpoint_blocks_additional++; + } + } + + struct translation *inprogress = &_inprogress; + for (int64_t i = 0; i < inprogress->length_of_array; i++) { + struct block_translation_pair *pair = &inprogress->block_translation[i]; + if (pair->size > 0 && + !(i < current->length_of_array && + current->block_translation[i].size > 0 && + current->block_translation[i].u.diskoff == pair->u.diskoff) && + !(i < checkpointed->length_of_array && + checkpointed->block_translation[i].size > 0 && + checkpointed->block_translation[i].u.diskoff == + pair->u.diskoff)) { + report->checkpoint_bytes_additional += pair->size; + report->checkpoint_blocks_additional++; + } + } + + _bt_block_allocator->UnusedStatistics(report); +} + +void block_table::get_info64(struct ftinfo64 *s) { + _mutex_lock(); + + struct translation *current = &_current; + s->num_blocks_allocated = current->length_of_array; + s->num_blocks_in_use = 0; + s->size_allocated = 0; + s->size_in_use = 0; + + for (int64_t i = 0; i < current->length_of_array; ++i) { + struct block_translation_pair *block = ¤t->block_translation[i]; + if (block->size != size_is_free) { + ++s->num_blocks_in_use; + s->size_in_use += block->size; + if (block->u.diskoff != diskoff_unused) { + uint64_t limit = block->u.diskoff + block->size; + if (limit > s->size_allocated) { + s->size_allocated = limit; + } + } + } + } + + _mutex_unlock(); +} + +int block_table::iterate_translation_tables( + uint64_t checkpoint_count, + int (*iter)(uint64_t checkpoint_count, + int64_t total_num_rows, + int64_t blocknum, + int64_t diskoff, + int64_t size, + void *extra), + void *iter_extra) { + int error = 0; + _mutex_lock(); + + int64_t total_num_rows = + _current.length_of_array + _checkpointed.length_of_array; + for (int64_t i = 0; error == 0 && i < _current.length_of_array; ++i) { + struct block_translation_pair *block = &_current.block_translation[i]; + error = iter(checkpoint_count, + total_num_rows, + i, + block->u.diskoff, + block->size, + iter_extra); + } + for (int64_t i = 0; error == 0 && i < _checkpointed.length_of_array; ++i) { + struct block_translation_pair *block = + &_checkpointed.block_translation[i]; + error = iter(checkpoint_count - 1, + total_num_rows, + i, + block->u.diskoff, + block->size, + iter_extra); + } + + _mutex_unlock(); + return error; +} |