diff options
Diffstat (limited to 'common/freelist.c')
-rw-r--r-- | common/freelist.c | 747 |
1 files changed, 747 insertions, 0 deletions
diff --git a/common/freelist.c b/common/freelist.c new file mode 100644 index 0000000..046c747 --- /dev/null +++ b/common/freelist.c @@ -0,0 +1,747 @@ + /* + Unix SMB/CIFS implementation. + + trivial database library + + Copyright (C) Andrew Tridgell 1999-2005 + Copyright (C) Paul `Rusty' Russell 2000 + Copyright (C) Jeremy Allison 2000-2003 + + ** NOTE! The following LGPL license applies to the tdb + ** library. This does NOT imply that all of Samba is released + ** under the LGPL + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 3 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, see <http://www.gnu.org/licenses/>. +*/ + +#include "tdb_private.h" + +/* read a freelist record and check for simple errors */ +int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct tdb_record *rec) +{ + if (tdb->methods->tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1) + return -1; + + if (rec->magic == TDB_MAGIC) { + /* this happens when a app is showdown while deleting a record - we should + not completely fail when this happens */ + TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read non-free magic 0x%x at offset=%u - fixing\n", + rec->magic, off)); + rec->magic = TDB_FREE_MAGIC; + if (tdb_rec_write(tdb, off, rec) == -1) + return -1; + } + + if (rec->magic != TDB_FREE_MAGIC) { + /* Ensure ecode is set for log fn. */ + tdb->ecode = TDB_ERR_CORRUPT; + TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read bad magic 0x%x at offset=%u\n", + rec->magic, off)); + return -1; + } + if (tdb_oob(tdb, rec->next, sizeof(*rec), 0) != 0) + return -1; + return 0; +} + +/* update a record tailer (must hold allocation lock) */ +static int update_tailer(struct tdb_context *tdb, tdb_off_t offset, + const struct tdb_record *rec) +{ + tdb_off_t totalsize; + + /* Offset of tailer from record header */ + totalsize = sizeof(*rec) + rec->rec_len; + return tdb_ofs_write(tdb, offset + totalsize - sizeof(tdb_off_t), + &totalsize); +} + +/** + * Read the record directly on the left. + * Fail if there is no record on the left. + */ +static int read_record_on_left(struct tdb_context *tdb, tdb_off_t rec_ptr, + tdb_off_t *left_p, + struct tdb_record *left_r) +{ + tdb_off_t left_ptr; + tdb_off_t left_size; + struct tdb_record left_rec; + int ret; + + left_ptr = rec_ptr - sizeof(tdb_off_t); + + if (left_ptr <= TDB_DATA_START(tdb->hash_size)) { + /* no record on the left */ + return -1; + } + + /* Read in tailer and jump back to header */ + ret = tdb_ofs_read(tdb, left_ptr, &left_size); + if (ret == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, + "tdb_free: left offset read failed at %u\n", left_ptr)); + return -1; + } + + /* it could be uninitialised data */ + if (left_size == 0 || left_size == TDB_PAD_U32) { + return -1; + } + + if (left_size > rec_ptr) { + return -1; + } + + left_ptr = rec_ptr - left_size; + + if (left_ptr < TDB_DATA_START(tdb->hash_size)) { + return -1; + } + + /* Now read in the left record */ + ret = tdb->methods->tdb_read(tdb, left_ptr, &left_rec, + sizeof(left_rec), DOCONV()); + if (ret == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, + "tdb_free: left read failed at %u (%u)\n", + left_ptr, left_size)); + return -1; + } + + *left_p = left_ptr; + *left_r = left_rec; + + return 0; +} + +/** + * Merge new freelist record with the direct left neighbour. + * This assumes that left_rec represents the record + * directly to the left of right_rec and that this is + * a freelist record. + */ +static int merge_with_left_record(struct tdb_context *tdb, + tdb_off_t left_ptr, + struct tdb_record *left_rec, + struct tdb_record *right_rec) +{ + int ret; + + left_rec->rec_len += sizeof(*right_rec) + right_rec->rec_len; + + ret = tdb_rec_write(tdb, left_ptr, left_rec); + if (ret == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, + "merge_with_left_record: update_left failed at %u\n", + left_ptr)); + return -1; + } + + ret = update_tailer(tdb, left_ptr, left_rec); + if (ret == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, + "merge_with_left_record: update_tailer failed at %u\n", + left_ptr)); + return -1; + } + + return 0; +} + +/** + * Check whether the record left of a given freelist record is + * also a freelist record, and if so, merge the two records. + * + * Return code: + * -1 upon error + * 0 if left was not a free record + * 1 if left was free and successfully merged. + * + * The current record is handed in with pointer and fully read record. + * + * The left record pointer and struct can be retrieved as result + * in lp and lr; + */ +static int check_merge_with_left_record(struct tdb_context *tdb, + tdb_off_t rec_ptr, + struct tdb_record *rec, + tdb_off_t *lp, + struct tdb_record *lr) +{ + tdb_off_t left_ptr; + struct tdb_record left_rec; + int ret; + + ret = read_record_on_left(tdb, rec_ptr, &left_ptr, &left_rec); + if (ret != 0) { + return 0; + } + + if (left_rec.magic != TDB_FREE_MAGIC) { + return 0; + } + + /* It's free - expand to include it. */ + ret = merge_with_left_record(tdb, left_ptr, &left_rec, rec); + if (ret != 0) { + return -1; + } + + if (lp != NULL) { + *lp = left_ptr; + } + + if (lr != NULL) { + *lr = left_rec; + } + + return 1; +} + +/** + * Check whether the record left of a given freelist record is + * also a freelist record, and if so, merge the two records. + * + * Return code: + * -1 upon error + * 0 if left was not a free record + * 1 if left was free and successfully merged. + * + * In this variant, the input record is specified just as the pointer + * and is read from the database if needed. + * + * next_ptr will contain the original record's next pointer after + * successful merging (which will be lost after merging), so that + * the caller can update the last pointer. + */ +static int check_merge_ptr_with_left_record(struct tdb_context *tdb, + tdb_off_t rec_ptr, + tdb_off_t *next_ptr) +{ + tdb_off_t left_ptr; + struct tdb_record rec, left_rec; + int ret; + + ret = read_record_on_left(tdb, rec_ptr, &left_ptr, &left_rec); + if (ret != 0) { + return 0; + } + + if (left_rec.magic != TDB_FREE_MAGIC) { + return 0; + } + + /* It's free - expand to include it. */ + + ret = tdb->methods->tdb_read(tdb, rec_ptr, &rec, + sizeof(rec), DOCONV()); + if (ret != 0) { + return -1; + } + + ret = merge_with_left_record(tdb, left_ptr, &left_rec, &rec); + if (ret != 0) { + return -1; + } + + if (next_ptr != NULL) { + *next_ptr = rec.next; + } + + return 1; +} + +/** + * Add an element into the freelist. + * + * We merge the new record into the left record if it is also a + * free record, but not with the right one. This makes the + * operation O(1) instead of O(n): merging with the right record + * requires a traverse of the freelist to find the previous + * record in the free list. + * + * This prevents db traverses from being O(n^2) after a lot of deletes. + */ +int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec) +{ + int ret; + + /* Allocation and tailer lock */ + if (tdb_lock(tdb, -1, F_WRLCK) != 0) + return -1; + + /* set an initial tailer, so if we fail we don't leave a bogus record */ + if (update_tailer(tdb, offset, rec) != 0) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed!\n")); + goto fail; + } + + ret = check_merge_with_left_record(tdb, offset, rec, NULL, NULL); + if (ret == -1) { + goto fail; + } + if (ret == 1) { + /* merged */ + goto done; + } + + /* Nothing to merge, prepend to free list */ + + rec->magic = TDB_FREE_MAGIC; + + if (tdb_ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 || + tdb_rec_write(tdb, offset, rec) == -1 || + tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) { + TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free record write failed at offset=%u\n", offset)); + goto fail; + } + +done: + /* And we're done. */ + tdb_unlock(tdb, -1, F_WRLCK); + return 0; + + fail: + tdb_unlock(tdb, -1, F_WRLCK); + return -1; +} + + + +/* + the core of tdb_allocate - called when we have decided which + free list entry to use + + Note that we try to allocate by grabbing data from the end of an existing record, + not the beginning. This is so the left merge in a free is more likely to be + able to free up the record without fragmentation + */ +static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, + tdb_len_t length, tdb_off_t rec_ptr, + struct tdb_record *rec, tdb_off_t last_ptr) +{ +#define MIN_REC_SIZE (sizeof(struct tdb_record) + sizeof(tdb_off_t) + 8) + + if (rec->rec_len < length + MIN_REC_SIZE) { + /* we have to grab the whole record */ + + /* unlink it from the previous record */ + if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) { + return 0; + } + + /* mark it not free */ + rec->magic = TDB_MAGIC; + if (tdb_rec_write(tdb, rec_ptr, rec) == -1) { + return 0; + } + return rec_ptr; + } + + /* we're going to just shorten the existing record */ + rec->rec_len -= (length + sizeof(*rec)); + if (tdb_rec_write(tdb, rec_ptr, rec) == -1) { + return 0; + } + if (update_tailer(tdb, rec_ptr, rec) == -1) { + return 0; + } + + /* and setup the new record */ + rec_ptr += sizeof(*rec) + rec->rec_len; + + memset(rec, '\0', sizeof(*rec)); + rec->rec_len = length; + rec->magic = TDB_MAGIC; + + if (tdb_rec_write(tdb, rec_ptr, rec) == -1) { + return 0; + } + + if (update_tailer(tdb, rec_ptr, rec) == -1) { + return 0; + } + + return rec_ptr; +} + +/* allocate some space from the free list. The offset returned points + to a unconnected tdb_record within the database with room for at + least length bytes of total data + + 0 is returned if the space could not be allocated + */ +static tdb_off_t tdb_allocate_from_freelist( + struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec) +{ + tdb_off_t rec_ptr, last_ptr, newrec_ptr; + struct tdb_chainwalk_ctx chainwalk; + bool modified; + struct { + tdb_off_t rec_ptr, last_ptr; + tdb_len_t rec_len; + } bestfit; + float multiplier = 1.0; + bool merge_created_candidate; + + /* over-allocate to reduce fragmentation */ + length *= 1.25; + + /* Extra bytes required for tailer */ + length += sizeof(tdb_off_t); + length = TDB_ALIGN(length, TDB_ALIGNMENT); + + again: + merge_created_candidate = false; + last_ptr = FREELIST_TOP; + + /* read in the freelist top */ + if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1) + return 0; + + modified = false; + tdb_chainwalk_init(&chainwalk, rec_ptr); + + bestfit.rec_ptr = 0; + bestfit.last_ptr = 0; + bestfit.rec_len = 0; + + /* + this is a best fit allocation strategy. Originally we used + a first fit strategy, but it suffered from massive fragmentation + issues when faced with a slowly increasing record size. + */ + while (rec_ptr) { + int ret; + tdb_off_t left_ptr; + struct tdb_record left_rec; + + if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) { + return 0; + } + + ret = check_merge_with_left_record(tdb, rec_ptr, rec, + &left_ptr, &left_rec); + if (ret == -1) { + return 0; + } + if (ret == 1) { + /* merged */ + rec_ptr = rec->next; + ret = tdb_ofs_write(tdb, last_ptr, &rec->next); + if (ret == -1) { + return 0; + } + + /* + * We have merged the current record into the left + * neighbour. So our traverse of the freelist will + * skip it and consider the next record in the chain. + * + * But the enlarged left neighbour may be a candidate. + * If it is, we can not directly use it, though. + * The only thing we can do and have to do here is to + * update the current best fit size in the chain if the + * current best fit is the left record. (By that we may + * worsen the best fit we already had, bit this is not a + * problem.) + * + * If the current best fit is not the left record, + * all we can do is remember the fact that a merge + * created a new candidate so that we can trigger + * a second walk of the freelist if at the end of + * the first walk we have not found any fit. + * This way we can avoid expanding the database. + */ + + if (bestfit.rec_ptr == left_ptr) { + bestfit.rec_len = left_rec.rec_len; + } + + if (left_rec.rec_len > length) { + merge_created_candidate = true; + } + + modified = true; + + continue; + } + + if (rec->rec_len >= length) { + if (bestfit.rec_ptr == 0 || + rec->rec_len < bestfit.rec_len) { + bestfit.rec_len = rec->rec_len; + bestfit.rec_ptr = rec_ptr; + bestfit.last_ptr = last_ptr; + } + } + + /* move to the next record */ + last_ptr = rec_ptr; + rec_ptr = rec->next; + + if (!modified) { + bool ok; + ok = tdb_chainwalk_check(tdb, &chainwalk, rec_ptr); + if (!ok) { + return 0; + } + } + + /* if we've found a record that is big enough, then + stop searching if its also not too big. The + definition of 'too big' changes as we scan + through */ + if (bestfit.rec_len > 0 && + bestfit.rec_len < length * multiplier) { + break; + } + + /* this multiplier means we only extremely rarely + search more than 50 or so records. At 50 records we + accept records up to 11 times larger than what we + want */ + multiplier *= 1.05; + } + + if (bestfit.rec_ptr != 0) { + if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) { + return 0; + } + + newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, + rec, bestfit.last_ptr); + return newrec_ptr; + } + + if (merge_created_candidate) { + goto again; + } + + /* we didn't find enough space. See if we can expand the + database and if we can then try again */ + if (tdb_expand(tdb, length + sizeof(*rec)) == 0) + goto again; + + return 0; +} + +static bool tdb_alloc_dead( + struct tdb_context *tdb, int hash, tdb_len_t length, + tdb_off_t *rec_ptr, struct tdb_record *rec) +{ + tdb_off_t last_ptr; + + *rec_ptr = tdb_find_dead(tdb, hash, rec, length, &last_ptr); + if (*rec_ptr == 0) { + return false; + } + /* + * Unlink the record from the hash chain, it's about to be moved into + * another one. + */ + return (tdb_ofs_write(tdb, last_ptr, &rec->next) == 0); +} + +static void tdb_purge_dead(struct tdb_context *tdb, uint32_t hash) +{ + int max_dead_records = tdb->max_dead_records; + + tdb->max_dead_records = 0; + + tdb_trim_dead(tdb, hash); + + tdb->max_dead_records = max_dead_records; +} + +/* + * Chain "hash" is assumed to be locked + */ + +tdb_off_t tdb_allocate(struct tdb_context *tdb, int hash, tdb_len_t length, + struct tdb_record *rec) +{ + tdb_off_t ret; + uint32_t i; + + if (tdb->max_dead_records == 0) { + /* + * No dead records to expect anywhere. Do the blocking + * freelist lock without trying to steal from others + */ + goto blocking_freelist_allocate; + } + + /* + * The following loop tries to get the freelist lock nonblocking. If + * it gets the lock, allocate from there. If the freelist is busy, + * instead of waiting we try to steal dead records from other hash + * chains. + * + * Be aware that we do nonblocking locks on the other hash chains as + * well and fail gracefully. This way we avoid deadlocks (we block two + * hash chains, something which is pretty bad normally) + */ + + for (i=0; i<tdb->hash_size; i++) { + + int list; + + list = BUCKET(hash+i); + + if (tdb_lock_nonblock(tdb, list, F_WRLCK) == 0) { + bool got_dead; + + got_dead = tdb_alloc_dead(tdb, list, length, &ret, rec); + tdb_unlock(tdb, list, F_WRLCK); + + if (got_dead) { + return ret; + } + } + + if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == 0) { + /* + * Under the freelist lock take the chance to give + * back our dead records. + */ + tdb_purge_dead(tdb, hash); + + ret = tdb_allocate_from_freelist(tdb, length, rec); + tdb_unlock(tdb, -1, F_WRLCK); + return ret; + } + } + +blocking_freelist_allocate: + + if (tdb_lock(tdb, -1, F_WRLCK) == -1) { + return 0; + } + /* + * Dead records can happen even if max_dead_records==0, they + * are older than the max_dead_records concept: They happen if + * tdb_delete happens concurrently with a traverse. + */ + tdb_purge_dead(tdb, hash); + ret = tdb_allocate_from_freelist(tdb, length, rec); + tdb_unlock(tdb, -1, F_WRLCK); + return ret; +} + +/** + * Merge adjacent records in the freelist. + */ +static int tdb_freelist_merge_adjacent(struct tdb_context *tdb, + int *count_records, int *count_merged) +{ + tdb_off_t cur, next; + int count = 0; + int merged = 0; + int ret; + + ret = tdb_lock(tdb, -1, F_RDLCK); + if (ret == -1) { + return -1; + } + + cur = FREELIST_TOP; + while (tdb_ofs_read(tdb, cur, &next) == 0 && next != 0) { + tdb_off_t next2; + + count++; + + ret = check_merge_ptr_with_left_record(tdb, next, &next2); + if (ret == -1) { + goto done; + } + if (ret == 1) { + /* + * merged: + * now let cur->next point to next2 instead of next + */ + + ret = tdb_ofs_write(tdb, cur, &next2); + if (ret != 0) { + goto done; + } + + next = next2; + merged++; + } + + cur = next; + } + + if (count_records != NULL) { + *count_records = count; + } + + if (count_merged != NULL) { + *count_merged = merged; + } + + ret = 0; + +done: + tdb_unlock(tdb, -1, F_RDLCK); + return ret; +} + +/** + * return the size of the freelist - no merging done + */ +static int tdb_freelist_size_no_merge(struct tdb_context *tdb) +{ + tdb_off_t ptr; + int count=0; + + if (tdb_lock(tdb, -1, F_RDLCK) == -1) { + return -1; + } + + ptr = FREELIST_TOP; + while (tdb_ofs_read(tdb, ptr, &ptr) == 0 && ptr != 0) { + count++; + } + + tdb_unlock(tdb, -1, F_RDLCK); + return count; +} + +/** + * return the size of the freelist - used to decide if we should repack + * + * As a side effect, adjacent records are merged unless the + * database is read-only, in order to reduce the fragmentation + * without repacking. + */ +_PUBLIC_ int tdb_freelist_size(struct tdb_context *tdb) +{ + + int count = 0; + + if (tdb->read_only) { + count = tdb_freelist_size_no_merge(tdb); + } else { + int ret; + ret = tdb_freelist_merge_adjacent(tdb, &count, NULL); + if (ret != 0) { + return -1; + } + } + + return count; +} |