src/rocksdb/table/block_based/data_block_hash_index.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
//  This source code is licensed under both the GPLv2 (found in the
//  COPYING file in the root directory) and Apache 2.0 License
//  (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <cstdint>
#include <string>
#include <vector>

#include "rocksdb/slice.h"

namespace ROCKSDB_NAMESPACE {
// This is an experimental feature aiming to reduce the CPU utilization of
// point-lookup within a data-block. It is only used in data blocks, and not
// in meta-data blocks or per-table index blocks.
//
// It only used to support BlockBasedTable::Get().
//
// A serialized hash index is appended to the data-block. The new block data
// format is as follows:
//
// DATA_BLOCK: [RI RI RI ... RI RI_IDX HASH_IDX FOOTER]
//
// RI:       Restart Interval (the same as the default data-block format)
// RI_IDX:   Restart Interval index (the same as the default data-block format)
// HASH_IDX: The new data-block hash index feature.
// FOOTER:   A 32bit block footer, which is the NUM_RESTARTS with the MSB as
//           the flag indicating if this hash index is in use. Note that
//           given a data block < 32KB, the MSB is never used. So we can
//           borrow the MSB as the hash index flag. Therefore, this format is
//           compatible with the legacy data-blocks with num_restarts < 32768,
//           as the MSB is 0.
//
// The format of the data-block hash index is as follows:
//
// HASH_IDX: [B B B ... B NUM_BUCK]
//
// B:         bucket, an array of restart index. Each buckets is uint8_t.
// NUM_BUCK:  Number of buckets, which is the length of the bucket array.
//
// We reserve two special flag:
//    kNoEntry=255,
//    kCollision=254.
//
// Therefore, the max number of restarts this hash index can supoport is 253.
//
// Buckets are initialized to be kNoEntry.
//
// When storing a key in the hash index, the key is first hashed to a bucket.
// If there the bucket is empty (kNoEntry), the restart index is stored in
// the bucket. If there is already a restart index there, we will update the
// existing restart index to a collision marker (kCollision). If the
// the bucket is already marked as collision, we do not store the restart
// index either.
//
// During query process, a key is first hashed to a bucket. Then we examine if
// the buckets store nothing (kNoEntry) or the bucket had a collision
// (kCollision). If either of those happens, we get the restart index of
// the key and will directly go to the restart interval to search the key.
//
// Note that we only support blocks with #restart_interval < 254. If a block
// has more restart interval than that, hash index will not be create for it.

const uint8_t kNoEntry = 255;
const uint8_t kCollision = 254;
const uint8_t kMaxRestartSupportedByHashIndex = 253;

// Because we use uint16_t address, we only support block no more than 64KB
const size_t kMaxBlockSizeSupportedByHashIndex = 1u << 16;
const double kDefaultUtilRatio = 0.75;

class DataBlockHashIndexBuilder {
 public:
  DataBlockHashIndexBuilder()
      : bucket_per_key_(-1 /*uninitialized marker*/),
        estimated_num_buckets_(0),
        valid_(false) {}

  void Initialize(double util_ratio) {
    if (util_ratio <= 0) {
      util_ratio = kDefaultUtilRatio;  // sanity check
    }
    bucket_per_key_ = 1 / util_ratio;
    valid_ = true;
  }

  inline bool Valid() const { return valid_ && bucket_per_key_ > 0; }
  void Add(const Slice& key, const size_t restart_index);
  void Finish(std::string& buffer);
  void Reset();
  inline size_t EstimateSize() const {
    uint16_t estimated_num_buckets =
        static_cast<uint16_t>(estimated_num_buckets_);

    // Maching the num_buckets number in DataBlockHashIndexBuilder::Finish.
    estimated_num_buckets |= 1;

    return sizeof(uint16_t) +
           static_cast<size_t>(estimated_num_buckets * sizeof(uint8_t));
  }

 private:
  double bucket_per_key_;  // is the multiplicative inverse of util_ratio_
  double estimated_num_buckets_;

  // Now the only usage for `valid_` is to mark false when the inserted
  // restart_index is larger than supported. In this case HashIndex is not
  // appended to the block content.
  bool valid_;

  std::vector<std::pair<uint32_t, uint8_t>> hash_and_restart_pairs_;
  friend class DataBlockHashIndex_DataBlockHashTestSmall_Test;
};

class DataBlockHashIndex {
 public:
  DataBlockHashIndex() : num_buckets_(0) {}

  void Initialize(const char* data, uint16_t size, uint16_t* map_offset);

  uint8_t Lookup(const char* data, uint32_t map_offset, const Slice& key) const;

  inline bool Valid() { return num_buckets_ != 0; }

 private:
  // To make the serialized hash index compact and to save the space overhead,
  // here all the data fields persisted in the block are in uint16 format.
  // We find that a uint16 is large enough to index every offset of a 64KiB
  // block.
  // So in other words, DataBlockHashIndex does not support block size equal
  // or greater then 64KiB.
  uint16_t num_buckets_;
};

}  // namespace ROCKSDB_NAMESPACE