// Copyright 2014 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.
#pragma once
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// A DBSlice contains read-only data that does not need to be freed.
typedef struct {
char* data;
int len;
} DBSlice;
// A DBString is structurally identical to a DBSlice, but the data it
// contains must be freed via a call to free().
typedef struct {
char* data;
int len;
} DBString;
// A DBStatus is an alias for DBString and is used to indicate that
// the return value indicates the success or failure of an
// operation. If DBStatus.data == NULL the operation succeeded.
typedef DBString DBStatus;
typedef struct {
DBSlice key;
int64_t wall_time;
int32_t logical;
} DBKey;
typedef struct {
int64_t wall_time;
int32_t logical;
} DBTimestamp;
typedef struct {
bool valid;
DBKey key;
DBSlice value;
DBStatus status;
} DBIterState;
typedef struct DBCache DBCache;
typedef struct DBEngine DBEngine;
typedef struct DBIterator DBIterator;
// DBOptions contains local database options.
typedef struct {
DBCache* cache;
bool logging_enabled;
int num_cpu;
int max_open_files;
bool use_switching_env;
bool must_exist;
bool read_only;
DBSlice rocksdb_options;
DBSlice extra_options;
} DBOptions;
// Create a new cache with the specified size.
DBCache* DBNewCache(uint64_t size);
// Add a reference to an existing cache. Note that the underlying
// RocksDB cache is shared between the original and new reference.
DBCache* DBRefCache(DBCache* cache);
// Release a cache, decrementing the reference count on the underlying
// RocksDB cache. Note that the RocksDB cache will not be freed until
// all of the references have been released.
void DBReleaseCache(DBCache* cache);
// Opens the database located in "dir", creating it if it doesn't
// exist.
DBStatus DBOpen(DBEngine** db, DBSlice dir, DBOptions options);
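// Example usage of DBOpen (an illustrative sketch; the directory path and the
// cache size are hypothetical, error handling is minimal, and free() comes
// from <stdlib.h>):
//
//   DBOptions opts = {0};                 // zero-valued options; real callers tune these.
//   opts.cache = DBNewCache(128 << 20);   // 128 MiB shared block cache.
//   opts.max_open_files = 1024;
//   DBSlice dir = {(char*)"/tmp/example-db", 15};
//   DBEngine* db = NULL;
//   DBStatus status = DBOpen(&db, dir, opts);
//   if (status.data != NULL) {
//     free(status.data);                  // status.data holds the error message.
//   } else {
//     // ... use the engine ...
//     DBClose(db);
//   }
//   DBReleaseCache(opts.cache);
//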
// Destroys the database located in "dir". As the name implies, this
// operation is destructive. Use with caution.
DBStatus DBDestroy(DBSlice dir);
// Closes the database, freeing memory and other resources.
DBStatus DBClose(DBEngine* db);
// Flushes all mem-table data to disk, blocking until the operation is
// complete.
DBStatus DBFlush(DBEngine* db);
// Syncs the RocksDB WAL ensuring all data is persisted to
// disk. Blocks until the operation is complete.
DBStatus DBSyncWAL(DBEngine* db);
// Forces an immediate compaction over all keys.
DBStatus DBCompact(DBEngine* db);
// Forces an immediate compaction over keys in the specified range.
// Note that if start is empty, it indicates the start of the database.
// If end is empty, it indicates the end of the database.
DBStatus DBCompactRange(DBEngine* db, DBSlice start, DBSlice end, bool force_bottommost);
// Stores the approximate on-disk size of the given key range into the
// supplied uint64.
DBStatus DBApproximateDiskBytes(DBEngine* db, DBKey start, DBKey end, uint64_t* size);
// Sets the database entry for "key" to "value".
DBStatus DBPut(DBEngine* db, DBKey key, DBSlice value);
// Merge the database entry (if any) for "key" with "value".
DBStatus DBMerge(DBEngine* db, DBKey key, DBSlice value);
// Retrieves the database entry for "key".
DBStatus DBGet(DBEngine* db, DBKey key, DBString* value);
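// Example usage of DBPut/DBGet (illustrative sketch; key, value and timestamp
// are hypothetical, error handling is minimal). The DBString filled in by
// DBGet must be freed by the caller.
//
//   DBKey key = {{(char*)"a", 1}, /* wall_time= */ 10, /* logical= */ 0};
//   DBSlice value = {(char*)"hello", 5};
//   DBStatus status = DBPut(db, key, value);
//   if (status.data != NULL) free(status.data);
//   DBString result = {0};
//   status = DBGet(db, key, &result);
//   if (status.data != NULL) {
//     free(status.data);
//   } else {
//     // ... use result.data / result.len ...
//     free(result.data);
//   }
//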
// Deletes the database entry for "key".
DBStatus DBDelete(DBEngine* db, DBKey key);
// Deletes a range of keys from start (inclusive) to end (exclusive).
DBStatus DBDeleteRange(DBEngine* db, DBKey start, DBKey end);
// Deletes a range of keys from start (inclusive) to end
// (exclusive). Unlike DBDeleteRange, this function finds the keys to
// delete by iterating over the supplied iterator and creating
// tombstones for the individual keys.
DBStatus DBDeleteIterRange(DBEngine* db, DBIterator* iter, DBKey start, DBKey end);
// Applies a batch of operations (puts, merges and deletes) to the
// database atomically and closes the batch. It is only valid to call
// this function on an engine created by DBNewBatch. If an error is
// returned, the batch is not closed and it is the caller's
// responsibility to call DBClose.
DBStatus DBCommitAndCloseBatch(DBEngine* db, bool sync);
// ApplyBatchRepr applies a batch of mutations encoded using the
// batch representation returned by DBBatchRepr(). It is only valid to
// call this function on an engine created by DBOpen() or DBNewBatch()
// (i.e. not a snapshot).
DBStatus DBApplyBatchRepr(DBEngine* db, DBSlice repr, bool sync);
// Returns the internal batch representation. The returned value is
// only valid until the next call to a method using the DBEngine and
// should thus be copied immediately. It is only valid to call this
// function on an engine created by DBNewBatch.
DBSlice DBBatchRepr(DBEngine* db);
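// Example usage of DBBatchRepr/DBApplyBatchRepr (illustrative sketch): because
// the returned slice is only valid until the batch engine is used again, it is
// copied into a private buffer before being applied to another engine. "batch"
// and "other_db" are hypothetical engines; error handling is minimal.
//
//   DBSlice repr = DBBatchRepr(batch);          // batch came from DBNewBatch().
//   char* copy = (char*)malloc(repr.len);
//   memcpy(copy, repr.data, repr.len);
//   DBSlice stable = {copy, repr.len};
//   DBStatus status = DBApplyBatchRepr(other_db, stable, true /* sync */);
//   if (status.data != NULL) free(status.data);
//   free(copy);
//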
// Creates a new snapshot of the database for use in DBGet() and
// DBNewIter(). It is the caller's responsibility to call DBClose().
DBEngine* DBNewSnapshot(DBEngine* db);
// Creates a new batch for performing a series of operations
// atomically. Use DBCommitAndCloseBatch() on the returned engine to apply the
// batch to the database. The writeOnly parameter controls whether the
// batch can be used for reads or only for writes. A writeOnly batch
// does not need to index keys for reading and can be faster if the
// number of keys is large (and reads are not necessary). It is the
// caller's responsibility to call DBClose().
DBEngine* DBNewBatch(DBEngine* db, bool writeOnly);
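// Example usage of DBNewBatch (illustrative sketch): stage two writes in a
// write-only batch and commit them atomically. Keys and values are
// hypothetical; if any step fails, the batch still has to be closed.
//
//   DBEngine* batch = DBNewBatch(db, true /* writeOnly */);
//   DBKey k1 = {{(char*)"a", 1}, 0, 0};
//   DBSlice v1 = {(char*)"x", 1};
//   DBKey k2 = {{(char*)"b", 1}, 0, 0};
//   DBSlice v2 = {(char*)"y", 1};
//   DBStatus status = DBPut(batch, k1, v1);
//   if (status.data == NULL) status = DBPut(batch, k2, v2);
//   if (status.data == NULL) status = DBCommitAndCloseBatch(batch, true /* sync */);
//   if (status.data != NULL) {
//     DBClose(batch);                  // the batch is not closed on error.
//     free(status.data);
//   }
//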
// Creates a new database iterator. When prefix is true, Seek will use
// the user-key prefix of the key supplied to DBIterSeek() to restrict
// which sstables are searched, but iteration (using Next) over keys
// without the same user-key prefix will not work correctly (keys may
// be skipped). When stats is true, the iterator will collect RocksDB
// performance counters which can be retrieved via `DBIterStats`.
//
// It is the caller's responsibility to call DBIterDestroy().
DBIterator* DBNewIter(DBEngine* db, bool prefix, bool stats);
DBIterator* DBNewTimeBoundIter(DBEngine* db, DBTimestamp min_ts, DBTimestamp max_ts,
bool with_stats);
// Destroys an iterator, freeing up any associated memory.
void DBIterDestroy(DBIterator* iter);
// Positions the iterator at the first key that is >= "key".
DBIterState DBIterSeek(DBIterator* iter, DBKey key);
typedef struct {
uint64_t internal_delete_skipped_count;
// the number of SSTables touched (only for time bound iterators).
// This field is populated from the table filter, not from the
// RocksDB perf counters.
//
// TODO(tschottdorf): populate this field for all iterators.
uint64_t timebound_num_ssts;
// New fields added here must also be added in various other places;
// just grep the repo for internal_delete_skipped_count. Sorry.
} IteratorStats;
IteratorStats DBIterStats(DBIterator* iter);
// Positions the iterator at the first key in the database.
DBIterState DBIterSeekToFirst(DBIterator* iter);
// Positions the iterator at the last key in the database.
DBIterState DBIterSeekToLast(DBIterator* iter);
// Advances the iterator to the next key. If skip_current_key_versions
// is true, any remaining versions for the current key are
// skipped. After this call, the returned DBIterState is valid iff the
// iterator was not positioned at the last key.
DBIterState DBIterNext(DBIterator* iter, bool skip_current_key_versions);
// Moves the iterator back to the previous key. If
// skip_current_key_versions is true, any remaining versions for the
// current key are skipped. After this call, the returned DBIterState is
// valid iff the iterator was not positioned at the first key.
DBIterState DBIterPrev(DBIterator* iter, bool skip_current_key_versions);
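// Example usage of the iterator API (illustrative sketch): visit every MVCC
// version at or after a hypothetical start key. The final-status handling
// assumes that an iteration error leaves valid == false and a non-NULL status.
//
//   DBIterator* iter = DBNewIter(db, false /* prefix */, false /* stats */);
//   DBKey start = {{(char*)"a", 1}, 0, 0};
//   DBIterState state = DBIterSeek(iter, start);
//   while (state.valid) {
//     // ... inspect state.key and state.value ...
//     state = DBIterNext(iter, false /* skip_current_key_versions */);
//   }
//   if (state.status.data != NULL) free(state.status.data);  // iteration error, if any.
//   DBIterDestroy(iter);
//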
// Implements the merge operator on a single pair of values. update is
// merged with existing. This method is provided for invocation from
// Go code.
DBStatus DBMergeOne(DBSlice existing, DBSlice update, DBString* new_value);
// NB: The function (cStatsToGoStats) that converts these to the go
// representation is unfortunately duplicated in engine and engineccl. If this
// struct is changed, both places need to be updated.
typedef struct {
DBStatus status;
int64_t live_bytes;
int64_t key_bytes;
int64_t val_bytes;
int64_t intent_bytes;
int64_t live_count;
int64_t key_count;
int64_t val_count;
int64_t intent_count;
int64_t intent_age;
int64_t gc_bytes_age;
int64_t sys_bytes;
int64_t sys_count;
int64_t last_update_nanos;
} MVCCStatsResult;
MVCCStatsResult MVCCComputeStats(DBIterator* iter, DBKey start, DBKey end, int64_t now_nanos);
bool MVCCIsValidSplitKey(DBSlice key, bool allow_meta2_splits);
DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey end, DBKey min_split,
int64_t target_size, bool allow_meta2_splits, DBString* split_key);
// DBTxn contains the fields from a roachpb.Transaction that are
// necessary for MVCC Get and Scan operations. Note that passing a
// serialized roachpb.Transaction appears to be a non-starter as an
// alternative due to the performance overhead.
//
// TODO(peter): We could investigate using
// https://github.com/petermattis/cppgo to generate C++ code that can
// read the Go roachpb.Transaction structure.
typedef struct {
DBSlice id;
uint32_t epoch;
DBTimestamp max_timestamp;
} DBTxn;
typedef struct {
DBSlice* bufs;
// len is the number of DBSlices in bufs.
int32_t len;
// count is the number of key/value pairs in bufs.
int32_t count;
} DBChunkedBuffer;
// DBScanResults contains the key/value pairs and intents encoded
// using the RocksDB batch repr format.
typedef struct {
DBStatus status;
DBChunkedBuffer data;
DBSlice intents;
DBTimestamp uncertainty_timestamp;
} DBScanResults;
DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn,
bool consistent, bool tombstones);
DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp,
int64_t max_keys, DBTxn txn, bool consistent, bool reverse, bool tombstones);
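// Example usage of MVCCScan (illustrative sketch): a consistent, forward,
// non-transactional scan of up to 100 keys at a hypothetical timestamp. A
// zero-valued DBTxn is assumed to mean "no transaction"; the key bounds are
// hypothetical.
//
//   DBIterator* iter = DBNewIter(db, false /* prefix */, false /* stats */);
//   DBSlice start = {(char*)"a", 1};
//   DBSlice end = {(char*)"z", 1};
//   DBTimestamp ts = {/* wall_time= */ 1000000000, /* logical= */ 0};
//   DBTxn no_txn = {0};                          // assumed: empty id => no transaction.
//   DBScanResults res = MVCCScan(iter, start, end, ts, 100 /* max_keys */, no_txn,
//                                true /* consistent */, false /* reverse */,
//                                false /* tombstones */);
//   if (res.status.data != NULL) {
//     free(res.status.data);
//   } else {
//     // ... decode res.data (chunks of RocksDB batch repr) and res.intents ...
//   }
//   DBIterDestroy(iter);
//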
// DBStatsResult contains various runtime stats for RocksDB.
typedef struct {
int64_t block_cache_hits;
int64_t block_cache_misses;
size_t block_cache_usage;
size_t block_cache_pinned_usage;
int64_t bloom_filter_prefix_checked;
int64_t bloom_filter_prefix_useful;
int64_t memtable_total_size;
int64_t flushes;
int64_t compactions;
int64_t table_readers_mem_estimate;
int64_t pending_compaction_bytes_estimate;
} DBStatsResult;
DBStatus DBGetStats(DBEngine* db, DBStatsResult* stats);
DBString DBGetCompactionStats(DBEngine* db);
typedef struct {
int level;
uint64_t size;
DBKey start_key;
DBKey end_key;
} DBSSTable;
// Retrieve stats about all of the live sstables. Note that the tables
// array must be freed along with the start_key and end_key of each
// table.
DBSSTable* DBGetSSTables(DBEngine* db, int* n);
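// Example of the required cleanup (illustrative sketch): the key bounds of
// each returned table and the table array itself are assumed to have been
// allocated with malloc and are released with free().
//
//   int n = 0;
//   DBSSTable* tables = DBGetSSTables(db, &n);
//   for (int i = 0; i < n; i++) {
//     // ... inspect tables[i].level, tables[i].size and the key bounds ...
//     free(tables[i].start_key.key.data);
//     free(tables[i].end_key.key.data);
//   }
//   free(tables);
//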
// DBGetUserProperties fetches the user properties stored in each sstable's
// metadata. These are returned as a serialized SSTUserPropertiesCollection
// proto.
DBString DBGetUserProperties(DBEngine* db);
// Bulk adds the files at the given paths to a database, all atomically. See the
// RocksDB documentation on `IngestExternalFile` for the various restrictions on
// what can be added. If move_files is true, the files will be moved instead of
// copied. If allow_file_modifications is false, RocksDB will return an error
// rather than editing the files in place if it would have needed to modify
// any of the files' sequence numbers.
DBStatus DBIngestExternalFiles(DBEngine* db, char** paths, size_t len, bool move_files,
bool allow_file_modifications);
typedef struct DBSstFileWriter DBSstFileWriter;
// Creates a new SstFileWriter with the default configuration.
DBSstFileWriter* DBSstFileWriterNew();
// Opens an in-memory file for output of an sstable.
DBStatus DBSstFileWriterOpen(DBSstFileWriter* fw);
// Adds a Put key with the provided value into the sstable being built. An error
// is returned if the key is not greater than every previously added entry (according
// to the comparator configured during writer creation). Open must have been
// called. Close cannot have been called.
DBStatus DBSstFileWriterPut(DBSstFileWriter* fw, DBKey key, DBSlice val);
// Adds a Merge key with the provided value into the sstable being built. The
// function has the same restrictions as DBSstFileWriterPut.
DBStatus DBSstFileWriterMerge(DBSstFileWriter* fw, DBKey key, DBSlice val);
// Adds a Deletion key into the sstable being built. The function has the same
// restrictions as DBSstFileWriterPut.
DBStatus DBSstFileWriterDelete(DBSstFileWriter* fw, DBKey key);
// Adds a RangeDeletion tombstone into the sstable being built. The function can
// be called at any time with respect to DBSstFileWriter{Put,Merge,Delete}. Open
// must have been called. Close cannot have been called.
DBStatus DBSstFileWriterDeleteRange(DBSstFileWriter* fw, DBKey start, DBKey end);
// Truncates the writer and stores the constructed file's contents in *data. May
// be called multiple times. The function may not truncate and return all keys
// if the underlying RocksDB blocks have not been flushed. Close cannot have
// been called.
DBStatus DBSstFileWriterTruncate(DBSstFileWriter* fw, DBString* data);
// Finalizes the writer and stores the constructed file's contents in *data. At
// least one kv entry must have been added. May only be called once.
DBStatus DBSstFileWriterFinish(DBSstFileWriter* fw, DBString* data);
// Closes the writer and frees memory and other resources. May only be called
// once.
void DBSstFileWriterClose(DBSstFileWriter* fw);
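// Example of the writer lifecycle (illustrative sketch): New -> Open -> Put
// (in key order) -> Finish -> Close. The key and value are hypothetical; the
// finished contents returned in data must be freed by the caller.
//
//   DBSstFileWriter* fw = DBSstFileWriterNew();
//   DBStatus status = DBSstFileWriterOpen(fw);
//   DBKey k = {{(char*)"a", 1}, 0, 0};
//   DBSlice v = {(char*)"x", 1};
//   if (status.data == NULL) status = DBSstFileWriterPut(fw, k, v);
//   DBString data = {0};
//   if (status.data == NULL) status = DBSstFileWriterFinish(fw, &data);
//   if (status.data != NULL) {
//     free(status.data);
//   } else {
//     // ... hand data.data / data.len to DBIngestExternalFiles or write it out ...
//     free(data.data);
//   }
//   DBSstFileWriterClose(fw);
//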
void DBRunLDB(int argc, char** argv);
// DBEnvWriteFile writes the given data as a new "file" in the given engine.
DBStatus DBEnvWriteFile(DBEngine* db, DBSlice path, DBSlice contents);
// DBFileLock contains various parameters set during DBLockFile and required for DBUnlockFile.
typedef void* DBFileLock;
// DBLockFile sets a lock on the specified file using RocksDB's file locking interface.
DBStatus DBLockFile(DBSlice filename, DBFileLock* lock);
// DBUnlockFile unlocks the file associated with the specified lock and GCs any allocated memory for
// the lock.
DBStatus DBUnlockFile(DBFileLock lock);
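// Example usage of the file lock API (illustrative sketch): take and release
// an exclusive lock on a hypothetical lock file.
//
//   DBSlice lockfile = {(char*)"/tmp/example-db/LOCK", 20};
//   DBFileLock lock = NULL;
//   DBStatus status = DBLockFile(lockfile, &lock);
//   if (status.data != NULL) {
//     free(status.data);                 // the lock could not be acquired.
//   } else {
//     // ... critical section ...
//     DBStatus unlock_status = DBUnlockFile(lock);
//     if (unlock_status.data != NULL) free(unlock_status.data);
//   }
//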
#ifdef __cplusplus
} // extern "C"
#endif