1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include "utilities/transactions/transaction_util.h"
#include <cinttypes>
#include <string>
#include <vector>
#include "db/db_impl/db_impl.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "util/cast_util.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
Status TransactionUtil::CheckKeyForConflicts(
DBImpl* db_impl, ColumnFamilyHandle* column_family, const std::string& key,
SequenceNumber snap_seq, const std::string* const read_ts, bool cache_only,
ReadCallback* snap_checker, SequenceNumber min_uncommitted) {
Status result;
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
auto cfd = cfh->cfd();
SuperVersion* sv = db_impl->GetAndRefSuperVersion(cfd);
if (sv == nullptr) {
result = Status::InvalidArgument("Could not access column family " +
cfh->GetName());
}
if (result.ok()) {
SequenceNumber earliest_seq =
db_impl->GetEarliestMemTableSequenceNumber(sv, true);
result = CheckKey(db_impl, sv, earliest_seq, snap_seq, key, read_ts,
cache_only, snap_checker, min_uncommitted);
db_impl->ReturnAndCleanupSuperVersion(cfd, sv);
}
return result;
}
Status TransactionUtil::CheckKey(DBImpl* db_impl, SuperVersion* sv,
SequenceNumber earliest_seq,
SequenceNumber snap_seq,
const std::string& key,
const std::string* const read_ts,
bool cache_only, ReadCallback* snap_checker,
SequenceNumber min_uncommitted) {
// When `min_uncommitted` is provided, keys are not always committed
// in sequence number order, and `snap_checker` is used to check whether
// specific sequence number is in the database is visible to the transaction.
// So `snap_checker` must be provided.
assert(min_uncommitted == kMaxSequenceNumber || snap_checker != nullptr);
Status result;
bool need_to_read_sst = false;
// Since it would be too slow to check the SST files, we will only use
// the memtables to check whether there have been any recent writes
// to this key after it was accessed in this transaction. But if the
// Memtables do not contain a long enough history, we must fail the
// transaction.
if (earliest_seq == kMaxSequenceNumber) {
// The age of this memtable is unknown. Cannot rely on it to check
// for recent writes. This error shouldn't happen often in practice as
// the Memtable should have a valid earliest sequence number except in some
// corner cases (such as error cases during recovery).
need_to_read_sst = true;
if (cache_only) {
result = Status::TryAgain(
"Transaction could not check for conflicts as the MemTable does not "
"contain a long enough history to check write at SequenceNumber: ",
std::to_string(snap_seq));
}
} else if (snap_seq < earliest_seq || min_uncommitted <= earliest_seq) {
// Use <= for min_uncommitted since earliest_seq is actually the largest sec
// before this memtable was created
need_to_read_sst = true;
if (cache_only) {
// The age of this memtable is too new to use to check for recent
// writes.
char msg[300];
snprintf(msg, sizeof(msg),
"Transaction could not check for conflicts for operation at "
"SequenceNumber %" PRIu64
" as the MemTable only contains changes newer than "
"SequenceNumber %" PRIu64
". Increasing the value of the "
"max_write_buffer_size_to_maintain option could reduce the "
"frequency "
"of this error.",
snap_seq, earliest_seq);
result = Status::TryAgain(msg);
}
}
if (result.ok()) {
SequenceNumber seq = kMaxSequenceNumber;
std::string timestamp;
bool found_record_for_key = false;
// When min_uncommitted == kMaxSequenceNumber, writes are committed in
// sequence number order, so only keys larger than `snap_seq` can cause
// conflict.
// When min_uncommitted != kMaxSequenceNumber, keys lower than
// min_uncommitted will not triggered conflicts, while keys larger than
// min_uncommitted might create conflicts, so we need to read them out
// from the DB, and call callback to snap_checker to determine. So only
// keys lower than min_uncommitted can be skipped.
SequenceNumber lower_bound_seq =
(min_uncommitted == kMaxSequenceNumber) ? snap_seq : min_uncommitted;
Status s = db_impl->GetLatestSequenceForKey(
sv, key, !need_to_read_sst, lower_bound_seq, &seq,
!read_ts ? nullptr : ×tamp, &found_record_for_key,
/*is_blob_index=*/nullptr);
if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
result = s;
} else if (found_record_for_key) {
bool write_conflict = snap_checker == nullptr
? snap_seq < seq
: !snap_checker->IsVisible(seq);
// Perform conflict checking based on timestamp if applicable.
if (!write_conflict && read_ts != nullptr) {
ColumnFamilyData* cfd = sv->cfd;
assert(cfd);
const Comparator* const ucmp = cfd->user_comparator();
assert(ucmp);
assert(read_ts->size() == ucmp->timestamp_size());
assert(read_ts->size() == timestamp.size());
// Write conflict if *ts < timestamp.
write_conflict = ucmp->CompareTimestamp(*read_ts, timestamp) < 0;
}
if (write_conflict) {
result = Status::Busy();
}
}
}
return result;
}
Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl,
const LockTracker& tracker,
bool cache_only) {
Status result;
std::unique_ptr<LockTracker::ColumnFamilyIterator> cf_it(
tracker.GetColumnFamilyIterator());
assert(cf_it != nullptr);
while (cf_it->HasNext()) {
ColumnFamilyId cf = cf_it->Next();
SuperVersion* sv = db_impl->GetAndRefSuperVersion(cf);
if (sv == nullptr) {
result = Status::InvalidArgument("Could not access column family " +
std::to_string(cf));
break;
}
SequenceNumber earliest_seq =
db_impl->GetEarliestMemTableSequenceNumber(sv, true);
// For each of the keys in this transaction, check to see if someone has
// written to this key since the start of the transaction.
std::unique_ptr<LockTracker::KeyIterator> key_it(
tracker.GetKeyIterator(cf));
assert(key_it != nullptr);
while (key_it->HasNext()) {
const std::string& key = key_it->Next();
PointLockStatus status = tracker.GetPointLockStatus(cf, key);
const SequenceNumber key_seq = status.seq;
// TODO: support timestamp-based conflict checking.
// CheckKeysForConflicts() is currently used only by optimistic
// transactions.
result = CheckKey(db_impl, sv, earliest_seq, key_seq, key,
/*read_ts=*/nullptr, cache_only);
if (!result.ok()) {
break;
}
}
db_impl->ReturnAndCleanupSuperVersion(cf, sv);
if (!result.ok()) {
break;
}
}
return result;
}
} // namespace ROCKSDB_NAMESPACE
#endif // ROCKSDB_LITE
|