summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/block_based/block_prefix_index.cc
blob: f9d92c74cb143219a73e7bf9177f47f2353eef33 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).

#include "table/block_based/block_prefix_index.h"

#include <vector>

#include "memory/arena.h"
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "util/coding.h"
#include "util/hash.h"

namespace ROCKSDB_NAMESPACE {

inline uint32_t Hash(const Slice& s) {
  return ROCKSDB_NAMESPACE::Hash(s.data(), s.size(), 0);
}

inline uint32_t PrefixToBucket(const Slice& prefix, uint32_t num_buckets) {
  return Hash(prefix) % num_buckets;
}

// The prefix block index is simply a bucket array, with each entry pointing to
// the blocks that span the prefixes hashed to this bucket.
//
// To reduce memory footprint, if there is only one block per bucket, the entry
// stores the block id directly. If there are more than one blocks per bucket,
// because of hash collision or a single prefix spanning multiple blocks,
// the entry points to an array of block ids. The block array is an array of
// uint32_t's. The first uint32_t indicates the total number of blocks, followed
// by the block ids.
//
// To differentiate the two cases, the high order bit of the entry indicates
// whether it is a 'pointer' into a separate block array.
// 0x7FFFFFFF is reserved for empty bucket.

const uint32_t kNoneBlock = 0x7FFFFFFF;
const uint32_t kBlockArrayMask = 0x80000000;

inline bool IsNone(uint32_t block_id) { return block_id == kNoneBlock; }

inline bool IsBlockId(uint32_t block_id) {
  return (block_id & kBlockArrayMask) == 0;
}

inline uint32_t DecodeIndex(uint32_t block_id) {
  uint32_t index = block_id ^ kBlockArrayMask;
  assert(index < kBlockArrayMask);
  return index;
}

inline uint32_t EncodeIndex(uint32_t index) {
  assert(index < kBlockArrayMask);
  return index | kBlockArrayMask;
}

// temporary storage for prefix information during index building
struct PrefixRecord {
  Slice prefix;
  uint32_t start_block;
  uint32_t end_block;
  uint32_t num_blocks;
  PrefixRecord* next;
};

class BlockPrefixIndex::Builder {
 public:
  explicit Builder(const SliceTransform* internal_prefix_extractor)
      : internal_prefix_extractor_(internal_prefix_extractor) {}

  void Add(const Slice& key_prefix, uint32_t start_block, uint32_t num_blocks) {
    PrefixRecord* record = reinterpret_cast<PrefixRecord*>(
        arena_.AllocateAligned(sizeof(PrefixRecord)));
    record->prefix = key_prefix;
    record->start_block = start_block;
    record->end_block = start_block + num_blocks - 1;
    record->num_blocks = num_blocks;
    prefixes_.push_back(record);
  }

  BlockPrefixIndex* Finish() {
    // For now, use roughly 1:1 prefix to bucket ratio.
    uint32_t num_buckets = static_cast<uint32_t>(prefixes_.size()) + 1;

    // Collect prefix records that hash to the same bucket, into a single
    // linklist.
    std::vector<PrefixRecord*> prefixes_per_bucket(num_buckets, nullptr);
    std::vector<uint32_t> num_blocks_per_bucket(num_buckets, 0);
    for (PrefixRecord* current : prefixes_) {
      uint32_t bucket = PrefixToBucket(current->prefix, num_buckets);
      // merge the prefix block span if the first block of this prefix is
      // connected to the last block of the previous prefix.
      PrefixRecord* prev = prefixes_per_bucket[bucket];
      if (prev) {
        assert(current->start_block >= prev->end_block);
        auto distance = current->start_block - prev->end_block;
        if (distance <= 1) {
          prev->end_block = current->end_block;
          prev->num_blocks = prev->end_block - prev->start_block + 1;
          num_blocks_per_bucket[bucket] += (current->num_blocks + distance - 1);
          continue;
        }
      }
      current->next = prev;
      prefixes_per_bucket[bucket] = current;
      num_blocks_per_bucket[bucket] += current->num_blocks;
    }

    // Calculate the block array buffer size
    uint32_t total_block_array_entries = 0;
    for (uint32_t i = 0; i < num_buckets; i++) {
      uint32_t num_blocks = num_blocks_per_bucket[i];
      if (num_blocks > 1) {
        total_block_array_entries += (num_blocks + 1);
      }
    }

    // Populate the final prefix block index
    uint32_t* block_array_buffer = new uint32_t[total_block_array_entries];
    uint32_t* buckets = new uint32_t[num_buckets];
    uint32_t offset = 0;
    for (uint32_t i = 0; i < num_buckets; i++) {
      uint32_t num_blocks = num_blocks_per_bucket[i];
      if (num_blocks == 0) {
        assert(prefixes_per_bucket[i] == nullptr);
        buckets[i] = kNoneBlock;
      } else if (num_blocks == 1) {
        assert(prefixes_per_bucket[i] != nullptr);
        assert(prefixes_per_bucket[i]->next == nullptr);
        buckets[i] = prefixes_per_bucket[i]->start_block;
      } else {
        assert(total_block_array_entries > 0);
        assert(prefixes_per_bucket[i] != nullptr);
        buckets[i] = EncodeIndex(offset);
        block_array_buffer[offset] = num_blocks;
        uint32_t* last_block = &block_array_buffer[offset + num_blocks];
        auto current = prefixes_per_bucket[i];
        // populate block ids from largest to smallest
        while (current != nullptr) {
          for (uint32_t iter = 0; iter < current->num_blocks; iter++) {
            *last_block = current->end_block - iter;
            last_block--;
          }
          current = current->next;
        }
        assert(last_block == &block_array_buffer[offset]);
        offset += (num_blocks + 1);
      }
    }

    assert(offset == total_block_array_entries);

    return new BlockPrefixIndex(internal_prefix_extractor_, num_buckets,
                                buckets, total_block_array_entries,
                                block_array_buffer);
  }

 private:
  const SliceTransform* internal_prefix_extractor_;

  std::vector<PrefixRecord*> prefixes_;
  Arena arena_;
};

Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor,
                                const Slice& prefixes, const Slice& prefix_meta,
                                BlockPrefixIndex** prefix_index) {
  uint64_t pos = 0;
  auto meta_pos = prefix_meta;
  Status s;
  Builder builder(internal_prefix_extractor);

  while (!meta_pos.empty()) {
    uint32_t prefix_size = 0;
    uint32_t entry_index = 0;
    uint32_t num_blocks = 0;
    if (!GetVarint32(&meta_pos, &prefix_size) ||
        !GetVarint32(&meta_pos, &entry_index) ||
        !GetVarint32(&meta_pos, &num_blocks)) {
      s = Status::Corruption(
          "Corrupted prefix meta block: unable to read from it.");
      break;
    }
    if (pos + prefix_size > prefixes.size()) {
      s = Status::Corruption(
          "Corrupted prefix meta block: size inconsistency.");
      break;
    }
    Slice prefix(prefixes.data() + pos, prefix_size);
    builder.Add(prefix, entry_index, num_blocks);

    pos += prefix_size;
  }

  if (s.ok() && pos != prefixes.size()) {
    s = Status::Corruption("Corrupted prefix meta block");
  }

  if (s.ok()) {
    *prefix_index = builder.Finish();
  }

  return s;
}

uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, uint32_t** blocks) {
  Slice prefix = internal_prefix_extractor_->Transform(key);

  uint32_t bucket = PrefixToBucket(prefix, num_buckets_);
  uint32_t block_id = buckets_[bucket];

  if (IsNone(block_id)) {
    return 0;
  } else if (IsBlockId(block_id)) {
    *blocks = &buckets_[bucket];
    return 1;
  } else {
    uint32_t index = DecodeIndex(block_id);
    assert(index < num_block_array_buffer_entries_);
    *blocks = &block_array_buffer_[index + 1];
    uint32_t num_blocks = block_array_buffer_[index];
    assert(num_blocks > 1);
    assert(index + num_blocks < num_block_array_buffer_entries_);
    return num_blocks;
  }
}

}  // namespace ROCKSDB_NAMESPACE