summaryrefslogtreecommitdiffstats
path: root/src/rocksdb/table/plain_table_index.cc
blob: 437409239740091397b52602004c476a0eddbb91 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).

#ifndef ROCKSDB_LITE

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <inttypes.h>

#include "table/plain_table_index.h"
#include "util/coding.h"
#include "util/hash.h"

namespace rocksdb {

namespace {
inline uint32_t GetBucketIdFromHash(uint32_t hash, uint32_t num_buckets) {
  assert(num_buckets > 0);
  return hash % num_buckets;
}
}

Status PlainTableIndex::InitFromRawData(Slice data) {
  if (!GetVarint32(&data, &index_size_)) {
    return Status::Corruption("Couldn't read the index size!");
  }
  assert(index_size_ > 0);
  if (!GetVarint32(&data, &num_prefixes_)) {
    return Status::Corruption("Couldn't read the index size!");
  }
  sub_index_size_ =
      static_cast<uint32_t>(data.size()) - index_size_ * kOffsetLen;

  char* index_data_begin = const_cast<char*>(data.data());
  index_ = reinterpret_cast<uint32_t*>(index_data_begin);
  sub_index_ = reinterpret_cast<char*>(index_ + index_size_);
  return Status::OK();
}

PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset(
    uint32_t prefix_hash, uint32_t* bucket_value) const {
  int bucket = GetBucketIdFromHash(prefix_hash, index_size_);
  GetUnaligned(index_ + bucket, bucket_value);
  if ((*bucket_value & kSubIndexMask) == kSubIndexMask) {
    *bucket_value ^= kSubIndexMask;
    return kSubindex;
  }
  if (*bucket_value >= kMaxFileSize) {
    return kNoPrefixForBucket;
  } else {
    // point directly to the file
    return kDirectToFile;
  }
}

void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash,
                                                        uint32_t offset) {
  if (num_records_in_current_group_ == kNumRecordsPerGroup) {
    current_group_ = AllocateNewGroup();
    num_records_in_current_group_ = 0;
  }
  auto& new_record = current_group_[num_records_in_current_group_++];
  new_record.hash = hash;
  new_record.offset = offset;
  new_record.next = nullptr;
}

void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice,
                                          uint32_t key_offset) {
  if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) {
    ++num_prefixes_;
    if (!is_first_record_) {
      keys_per_prefix_hist_.Add(num_keys_per_prefix_);
    }
    num_keys_per_prefix_ = 0;
    prev_key_prefix_ = key_prefix_slice.ToString();
    prev_key_prefix_hash_ = GetSliceHash(key_prefix_slice);
    due_index_ = true;
  }

  if (due_index_) {
    // Add an index key for every kIndexIntervalForSamePrefixKeys keys
    record_list_.AddRecord(prev_key_prefix_hash_, key_offset);
    due_index_ = false;
  }

  num_keys_per_prefix_++;
  if (index_sparseness_ == 0 || num_keys_per_prefix_ % index_sparseness_ == 0) {
    due_index_ = true;
  }
  is_first_record_ = false;
}

Slice PlainTableIndexBuilder::Finish() {
  AllocateIndex();
  std::vector<IndexRecord*> hash_to_offsets(index_size_, nullptr);
  std::vector<uint32_t> entries_per_bucket(index_size_, 0);
  BucketizeIndexes(&hash_to_offsets, &entries_per_bucket);

  keys_per_prefix_hist_.Add(num_keys_per_prefix_);
  ROCKS_LOG_INFO(ioptions_.info_log, "Number of Keys per prefix Histogram: %s",
                 keys_per_prefix_hist_.ToString().c_str());

  // From the temp data structure, populate indexes.
  return FillIndexes(hash_to_offsets, entries_per_bucket);
}

void PlainTableIndexBuilder::AllocateIndex() {
  if (prefix_extractor_ == nullptr || hash_table_ratio_ <= 0) {
    // Fall back to pure binary search if the user fails to specify a prefix
    // extractor.
    index_size_ = 1;
  } else {
    double hash_table_size_multipier = 1.0 / hash_table_ratio_;
    index_size_ =
      static_cast<uint32_t>(num_prefixes_ * hash_table_size_multipier) + 1;
    assert(index_size_ > 0);
  }
}

void PlainTableIndexBuilder::BucketizeIndexes(
    std::vector<IndexRecord*>* hash_to_offsets,
    std::vector<uint32_t>* entries_per_bucket) {
  bool first = true;
  uint32_t prev_hash = 0;
  size_t num_records = record_list_.GetNumRecords();
  for (size_t i = 0; i < num_records; i++) {
    IndexRecord* index_record = record_list_.At(i);
    uint32_t cur_hash = index_record->hash;
    if (first || prev_hash != cur_hash) {
      prev_hash = cur_hash;
      first = false;
    }
    uint32_t bucket = GetBucketIdFromHash(cur_hash, index_size_);
    IndexRecord* prev_bucket_head = (*hash_to_offsets)[bucket];
    index_record->next = prev_bucket_head;
    (*hash_to_offsets)[bucket] = index_record;
    (*entries_per_bucket)[bucket]++;
  }

  sub_index_size_ = 0;
  for (auto entry_count : *entries_per_bucket) {
    if (entry_count <= 1) {
      continue;
    }
    // Only buckets with more than 1 entry will have subindex.
    sub_index_size_ += VarintLength(entry_count);
    // total bytes needed to store these entries' in-file offsets.
    sub_index_size_ += entry_count * PlainTableIndex::kOffsetLen;
  }
}

Slice PlainTableIndexBuilder::FillIndexes(
    const std::vector<IndexRecord*>& hash_to_offsets,
    const std::vector<uint32_t>& entries_per_bucket) {
  ROCKS_LOG_DEBUG(ioptions_.info_log,
                  "Reserving %" PRIu32 " bytes for plain table's sub_index",
                  sub_index_size_);
  auto total_allocate_size = GetTotalSize();
  char* allocated = arena_->AllocateAligned(
      total_allocate_size, huge_page_tlb_size_, ioptions_.info_log);

  auto temp_ptr = EncodeVarint32(allocated, index_size_);
  uint32_t* index =
      reinterpret_cast<uint32_t*>(EncodeVarint32(temp_ptr, num_prefixes_));
  char* sub_index = reinterpret_cast<char*>(index + index_size_);

  uint32_t sub_index_offset = 0;
  for (uint32_t i = 0; i < index_size_; i++) {
    uint32_t num_keys_for_bucket = entries_per_bucket[i];
    switch (num_keys_for_bucket) {
      case 0:
        // No key for bucket
        PutUnaligned(index + i, (uint32_t)PlainTableIndex::kMaxFileSize);
        break;
      case 1:
        // point directly to the file offset
        PutUnaligned(index + i, hash_to_offsets[i]->offset);
        break;
      default:
        // point to second level indexes.
        PutUnaligned(index + i, sub_index_offset | PlainTableIndex::kSubIndexMask);
        char* prev_ptr = &sub_index[sub_index_offset];
        char* cur_ptr = EncodeVarint32(prev_ptr, num_keys_for_bucket);
        sub_index_offset += static_cast<uint32_t>(cur_ptr - prev_ptr);
        char* sub_index_pos = &sub_index[sub_index_offset];
        IndexRecord* record = hash_to_offsets[i];
        int j;
        for (j = num_keys_for_bucket - 1; j >= 0 && record;
             j--, record = record->next) {
          EncodeFixed32(sub_index_pos + j * sizeof(uint32_t), record->offset);
        }
        assert(j == -1 && record == nullptr);
        sub_index_offset += PlainTableIndex::kOffsetLen * num_keys_for_bucket;
        assert(sub_index_offset <= sub_index_size_);
        break;
    }
  }
  assert(sub_index_offset == sub_index_size_);

  ROCKS_LOG_DEBUG(ioptions_.info_log,
                  "hash table size: %" PRIu32 ", suffix_map length %" PRIu32,
                  index_size_, sub_index_size_);
  return Slice(allocated, GetTotalSize());
}

const std::string PlainTableIndexBuilder::kPlainTableIndexBlock =
    "PlainTableIndexBlock";
};  // namespace rocksdb

#endif  // ROCKSDB_LITE