-
Notifications
You must be signed in to change notification settings - Fork 3.9k
/
Copy pathmetadata.proto
457 lines (418 loc) · 23.9 KB
/
metadata.proto
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.
syntax = "proto2";
package cockroach.roachpb;
option go_package = "roachpb";
import "util/unresolved_addr.proto";
import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";
// Attributes is a list of arbitrary strings that describe node topology,
// store type, and machine capabilities.
message Attributes {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.equal) = true;

  // attrs holds the individual attribute strings.
  repeated string attrs = 1 [
    (gogoproto.moretags) = "yaml:\"attrs,flow\""
  ];
}
// ReplicationTarget names a node/store pair.
//
// TODO(aayush): There are a bunch of usages of ReplicaDescriptor in allocator
// methods where we should really be using ReplicationTarget. We should
// instead have something like a `ReplicationTargetI` interface that both
// `ReplicaDescriptor` and `ReplicationTarget` implement, and refactor our
// utility methods to operate on that interface.
message ReplicationTarget {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.equal) = true;

  // node_id is the node half of the pair.
  optional int32 node_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "NodeID",
    (gogoproto.casttype) = "NodeID"
  ];

  // store_id is the store half of the pair.
  optional int32 store_id = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "StoreID",
    (gogoproto.casttype) = "StoreID"
  ];
}
// ReplicaType identifies which raft activities a replica participates in. In
// normal operation, VOTER_FULL, NON_VOTER, and LEARNER are the only used
// states. However, atomic replication changes require a transition through a
// "joint config"; in this joint config, the VOTER_DEMOTING_{LEARNER, NON_VOTER}
// and VOTER_INCOMING types are used as well to denote voters which are being
// downgraded to learners and newly added by the change, respectively. When
// being removed, a demoting voter is turning into a learner, which we prefer
// over a direct removal, which was used prior to v20.1 and uses the
// VOTER_OUTGOING type instead (see VersionChangeReplicasDemotion for details on
// why we're not doing that any more).
//
// All VOTER* types indicate a replica that participates in all raft activities,
// including voting for leadership and committing entries. Typically, this
// requires a majority of voters to reach a decision. In a joint config, two
// separate majorities are required: one from the set of replicas that have
// either type VOTER_FULL or VOTER_OUTGOING or
// VOTER_DEMOTING_{LEARNER, NON_VOTER}, as well as that of the set of types
// VOTER_FULL and VOTER_INCOMING. For example,
// when type VOTER_FULL is assigned to replicas 1 and 2, while 3 is
// VOTER_OUTGOING and 4 is VOTER_INCOMING, then the two sets over which quorums
// need to be achieved are {1,2,3} and {1,2,4}. Thus, {1,2} is a quorum of both,
// {1,3} is a quorum of the first but not the second, {1,4} is a quorum of the
// second but not the first, and {3,4} is a quorum of neither.
enum ReplicaType {
option (gogoproto.goproto_enum_prefix) = false;
// NB: the numeric values below are wire identifiers and are not in
// declaration order (e.g. LEARNER = 1 is declared after
// VOTER_DEMOTING_LEARNER = 4).

// VOTER_FULL indicates a replica that is a voter both in the
// incoming and outgoing set.
VOTER_FULL = 0;
// VOTER_INCOMING indicates a voting replica that will be a
// VOTER_FULL once the ongoing atomic replication change is finalized; that is,
// it is in the process of being added. In practice, this replica type should
// be treated like a VOTER_FULL.
VOTER_INCOMING = 2;
// VOTER_OUTGOING indicates a voting replica that will not be part
// of the descriptor once the ongoing atomic replication change is finalized;
// that is, it is in the process of being removed. In practice, a replica of
// this type should be treated accordingly and no work should be assigned to
// it.
//
// Note: We're not using VOTER_OUTGOING since 20.1. We're using VOTER_DEMOTING
// instead. See #42251.
VOTER_OUTGOING = 3;
// VOTER_DEMOTING_LEARNER indicates a voting replica that will become a
// learner once the ongoing atomic replication change is finalized; that is,
// it is in the process of being demoted. Since learners are currently
// short-lived, this replica is really being removed, with an intermediate
// step, and no work should be assigned to it.
VOTER_DEMOTING_LEARNER = 4;
// LEARNER indicates a replica that applies committed entries, but does not
// count towards the quorum(s). Candidates will not ask for (or take into
// account) votes of (peers they consider) LEARNERs for leadership nor do
// their acknowledged log entries get taken into account for determining the
// committed index. Learners in CockroachDB are a short-term transient state:
// a replica being added and on its way to being a VOTER_{FULL,INCOMING}, or a
// VOTER_DEMOTING_LEARNER being removed.
//
// Note that once these replicas upreplicate after receiving their initial
// snapshot, they will count towards the raft leader's quota pool and throttle
// incoming proposals if they fall "too far behind".
LEARNER = 1;
// NON_VOTER indicates a replica that applies committed entries, but does not
// count towards the quorum(s). Candidates will not ask for (or take into
// account) votes of (peers they consider) NON_VOTERs for leadership nor do
// their acknowledged log entries get taken into account for determining the
// committed index.
//
// Under the hood, it is based on an etcd/raft LearnerNode, like the LEARNER
// replica type defined above. They will also cause the quota pool on the
// leader to throttle incoming proposals if they fall behind.
//
// Unlike LEARNERs, these are a persistent state meant to serve user traffic
// via follower reads. See comment above ReplicaDescriptors.NonVoters() for
// differences in how LEARNERs and NON_VOTERs are handled internally.
NON_VOTER = 5;
// VOTER_DEMOTING_NON_VOTER indicates a voting replica in the outgoing group
// of a joint state, which will become a non-voter when the atomic replication
// change is finalized (i.e. when we exit the joint state).
VOTER_DEMOTING_NON_VOTER = 6;
}
// ReplicaDescriptor locates a replica by node ID (which corresponds to a
// host:port via a lookup on the gossip network) and by store ID (which
// identifies the device).
//
// TODO(jeffreyxiao): All nullable fields in ReplicaDescriptor can be made
// non-nullable if #38302 is guaranteed to be on all nodes (I.E. 20.1).
message ReplicaDescriptor {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // node_id is the ID of the node holding the replica.
  optional int32 node_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "NodeID",
    (gogoproto.casttype) = "NodeID",
    (gogoproto.moretags) = "yaml:\"NodeID\""
  ];

  // store_id is the ID of the store holding the replica.
  optional int32 store_id = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "StoreID",
    (gogoproto.casttype) = "StoreID",
    (gogoproto.moretags) = "yaml:\"StoreID\""
  ];

  // replica_id uniquely identifies a replica instance. If a range is removed
  // from a store and later re-added to that same store, the new instance
  // receives a higher replica_id.
  optional int32 replica_id = 3 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "ReplicaID",
    (gogoproto.casttype) = "ReplicaID",
    (gogoproto.moretags) = "yaml:\"ReplicaID\""
  ];

  // type indicates which raft activities the replica participates in. A nil
  // type is equivalent to VOTER_FULL.
  optional ReplicaType type = 4 [
    (gogoproto.moretags) = "yaml:\"ReplicaType,omitempty\""
  ];
}
// ReplicaIdent uniquely identifies one specific replica by pairing a range ID
// with a replica descriptor.
message ReplicaIdent {
  // range_id is the ID of the range the replica belongs to.
  optional int64 range_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "RangeID",
    (gogoproto.casttype) = "RangeID"
  ];

  // replica is the descriptor of the replica itself.
  optional ReplicaDescriptor replica = 2 [
    (gogoproto.nullable) = false
  ];
}
// RangeDescriptor is the value stored in a range metadata key.
// A range is described using an inclusive start key, a non-inclusive end key,
// and a list of replicas where the range is stored.
//
// NOTE: Care must be taken when adding or removing fields from this proto
// because we have code that relies on the descriptor comparing Equal() after
// round-tripping through a previous/next version node (i.e. in mixed-version
// clusters). Note that we don't need the proto encoding to be stable since,
// when doing CPuts we use the raw bytes we've read from the DB as the expected
// value (instead of re-marshaling the proto), but unfortunately we also need
// the Equal() method to work. Also note that we configure our protos to not
// maintain unrecognized fields.
//
// TODO(jeffreyxiao): All nullable fields in RangeDescriptor can be made
// non-nullable if #38302 is guaranteed to be on all nodes (I.E. 20.1).
message RangeDescriptor {
option (gogoproto.goproto_stringer) = false;
// We implement the Equal method by hand so that it can ignore deprecated
// fields. This can be reverted in 21.1 once the "previous version" no longer
// populates deprecated_generation_comparable.
option (gogoproto.equal) = false;
option (gogoproto.populate) = true;
// range_id is the ID of the range this descriptor describes.
optional int64 range_id = 1 [(gogoproto.nullable) = false,
(gogoproto.customname) = "RangeID", (gogoproto.casttype) = "RangeID"];
// start_key is the first key which may be contained by this range.
optional bytes start_key = 2 [(gogoproto.casttype) = "RKey"];
// end_key marks the end of the range's possible keys. EndKey itself is not
// contained in this range - it will be contained in the immediately
// subsequent range.
optional bytes end_key = 3 [(gogoproto.casttype) = "RKey"];
// InternalReplicas is the set of nodes/stores on which replicas of
// this range are stored. DO NOT USE this field directly, use the `Replicas`
// method instead. The ordering is arbitrary and subject to permutation.
repeated ReplicaDescriptor internal_replicas = 4 [(gogoproto.nullable) = false];
// next_replica_id is a counter used to generate replica IDs.
optional int32 next_replica_id = 5 [(gogoproto.nullable) = false,
(gogoproto.customname) = "NextReplicaID", (gogoproto.casttype) = "ReplicaID"];
// generation is incremented on every split, merge, and every replica change,
// i.e., whenever the span of the range or replica set changes. It is
// initialized to zero when the range is first created. The generation
// counter was first introduced to allow the range descriptor resulting from
// a split and then merge to be distinguishable from the initial range
// descriptor. This is important since changes to the range descriptors use
// CPuts to ensure mutual exclusion.
//
// See #28071 for details on the above.
//
// Generations are also useful to make local replicaGC decisions when applying
// a snapshot on keyspace that has overlapping replicas (but note that we do
// not use this at the time of writing due to migration concerns; see below).
//
// We want to be able to compare the snapshot range's generation counter to
// that of the overlapping replicas to draw a conclusion about whether the
// snapshot can be applied (in which case the overlapping replicas need to be
// safely removable). To that end, on a split, not only do we increment the
// left hand side's generation, we also copy the resultant generation to the
// newly created right hand side. On merges, we update the left hand side's
// generation so that it exceeds by one the maximum of the left hand side and
// the right hand side's generations from before the merge.
//
// If two replicas (perhaps one of them represented by a raft or preemptive
// snapshot) as defined by their full range descriptor (including, notably,
// the generation) overlap, then one of them has to be stale. This is because
// the keyspace cleanly shards into non-overlapping ranges at all times (i.e.
// for all consistent snapshots). Since meta ranges (or more generally, range
// descriptors) are only ever updated transactionally, mutations to the meta
// ranges can be serialized (i.e. put into some sequential ordering). We know
// that the descriptors corresponding to both of our replicas can't be from
// the same consistent snapshot of the meta ranges, so there is a version of
// the meta ranges that includes only the first replica, and there is a
// version that includes only the second replica. Without loss of generality,
// assume that the first version is "older". This means that there is a finite
// sequence of splits and merges that were applied to the consistent snapshot
// corresponding to the first version which resulted in the second version of
// the meta ranges.
//
// Each individual operation, thanks to the generational semantics above, has
// the invariant that the resulting descriptors have a strictly larger
// generation than any descriptors from the previous version that they cover.
// For example, if a descriptor [a,c) at generation 5 is split into [a,b) and
// [b,c), both of those latter range descriptors have generation 6. If [c,d)
// is at generation 12 and [d, f) is at generation 17, then the resulting
// merged range [c,f) will have generation 18.
//
// At the end of the day, for incoming snapshots, this means that we only have
// to collect the overlapping replicas and their generations. Any replica with
// a smaller generation is stale by the above argument and can be replicaGC'ed
// right away. Any replica with a larger generation indicates that the snapshot
// is stale and should be discarded. A replica with the same generation is
// necessarily a replica of the range the snapshot is addressing (this is the
// usual case, in which a snapshot "overlaps" precisely one replica, which is
// the replica it's supposed to update, and no splits and merges have taken
// place at all).
//
// For a third note, observe that the generational semantics above may
// possibly allow range merges without colocation, at least in the sense that
// the counter examples in #28071 are defused. This is because the
// generational counter can answer the question whether the overlapping
// replica is gc'able or not. If it is not gc'able, then by definition the
// replica applying the merge is.
optional int64 generation = 6 [(gogoproto.nullable) = false, (gogoproto.casttype) = "RangeGeneration"];
// This field is not used any more, but we need to maintain it in 20.2 because
// 20.1 nodes need descriptors to round-trip through 20.2 nodes and compare
// Equal() when they come back. 20.2 nodes know to ignore this field when
// comparing, so the field can be removed in 21.1.
optional bool deprecated_generation_comparable = 8;
// The presence of the sticky_bit indicates that the range should not be
// automatically merged by the merge queue with the range to its left. It is
// set during a split operation and unset during an unsplit operation. Note
// that the unsplit operation is a different operation from the merge
// operation. Unsplit only unsets sticky_bit. It is represented by a
// timestamp that indicates when it expires. After the expiration time has
// passed, the split is eligible for automatic merging. A nil sticky bit is
// equivalent to hlc.Timestamp{}.
//
// The reason the sticky_bit exists is because when the merge queue is
// enabled and a manual split happens, the split ranges would immediately be
// merged by the merge queue. Previously, we threw an error when a user
// attempted to execute ALTER TABLE/INDEX ... SPLIT AT ... when the merge
// queue is enabled. With sticky_bit, users can manually split ranges without
// disabling the merge queue.
optional util.hlc.Timestamp sticky_bit = 7;
}
// Percentiles holds a small, hard-coded set of percentiles intended to
// summarize a distribution.
message Percentiles {
  option (gogoproto.goproto_stringer) = false;

  // Each pNN field carries the value at the percentile named by the field.
  optional double p10 = 1 [(gogoproto.nullable) = false];
  optional double p25 = 2 [(gogoproto.nullable) = false];
  optional double p50 = 3 [(gogoproto.nullable) = false];
  optional double p75 = 4 [(gogoproto.nullable) = false];
  optional double p90 = 5 [(gogoproto.nullable) = false];

  // NOTE(review): presumably the maximum value of the distribution; confirm
  // against the code that populates it.
  optional double pMax = 6 [(gogoproto.nullable) = false];
}
// StoreCapacity carries capacity information for a storage device.
message StoreCapacity {
  option (gogoproto.goproto_stringer) = false;

  // capacity is the total capacity of the disk used by the store, counting
  // space taken by the operating system and other applications.
  optional int64 capacity = 1 [(gogoproto.nullable) = false];

  // available is the space remaining on the disk used by the store.
  optional int64 available = 2 [(gogoproto.nullable) = false];

  // used is the amount of disk space consumed by the data in the CockroachDB
  // store. It will be less than (capacity - available): those two fields
  // account for the entire disk and everything on it, whereas this field
  // tracks only the store's own disk usage.
  optional int64 used = 8 [(gogoproto.nullable) = false];

  // logical_bytes is the amount of logical bytes held in the store, ignoring
  // RocksDB space overhead. It is useful for rebalancing, so that moving a
  // replica from one store to another actually removes its bytes from the
  // source store even though RocksDB may not reclaim the physical disk space
  // for a while.
  optional int64 logical_bytes = 9 [(gogoproto.nullable) = false];

  // NOTE(review): range_count and lease_count are undocumented in this file;
  // presumably the number of ranges and leases held by the store -- confirm.
  optional int32 range_count = 3 [(gogoproto.nullable) = false];
  optional int32 lease_count = 4 [(gogoproto.nullable) = false];

  // queries_per_second is the average number of queries processed per second
  // by replicas in the store. The stat is tracked over the time period
  // defined in storage/replica_stats.go, which as of July 2018 is 30 minutes.
  optional double queries_per_second = 10 [(gogoproto.nullable) = false];

  // writes_per_second is the average number of keys written per second by
  // ranges in the store. The stat is tracked over the time period defined in
  // storage/replica_stats.go, which as of July 2018 is 30 minutes.
  optional double writes_per_second = 5 [(gogoproto.nullable) = false];

  // read_amplification is the current read amplification in the store.
  optional int64 read_amplification = 11 [(gogoproto.nullable) = false];

  // bytes_per_replica and writes_per_replica hold percentiles for the number
  // of bytes and writes-per-second to each replica in the store. This
  // information can be used for rebalancing decisions.
  optional Percentiles bytes_per_replica = 6 [(gogoproto.nullable) = false];
  optional Percentiles writes_per_replica = 7 [(gogoproto.nullable) = false];
}
// StoreProperties holds configuration and OS-level details for a storage
// device.
message StoreProperties {
  // encrypted indicates whether the store is encrypted.
  optional bool encrypted = 1 [(gogoproto.nullable) = false];

  // read_only indicates whether the store is attached read_only.
  optional bool read_only = 2 [(gogoproto.nullable) = false];

  // file_store_properties reports details about the underlying filesystem
  // when the store is supported by a file store. Unset otherwise.
  optional FileStoreProperties file_store_properties = 3;
}
// FileStoreProperties holds configuration and OS-level details for a file
// store.
message FileStoreProperties {
  option (gogoproto.goproto_stringer) = false;

  // path is the configured filesystem path for the store.
  optional string path = 1 [(gogoproto.nullable) = false];

  // fs_type is the external filesystem type (ufs, ext4, etc), if known.
  optional string fs_type = 2 [(gogoproto.nullable) = false];

  // block_device is the block device that supports the filesystem, if known.
  optional string block_device = 3 [(gogoproto.nullable) = false];

  // mount_point is the mount point of the filesystem, if known.
  optional string mount_point = 4 [(gogoproto.nullable) = false];

  // mount_options holds the mount options, if known.
  optional string mount_options = 5 [(gogoproto.nullable) = false];
}
// NodeDescriptor carries details on a node's physical/network topology.
message NodeDescriptor {
  option (gogoproto.equal) = true;

  optional int32 node_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "NodeID",
    (gogoproto.casttype) = "NodeID"
  ];
  optional util.UnresolvedAddr address = 2 [(gogoproto.nullable) = false];
  optional Attributes attrs = 3 [(gogoproto.nullable) = false];
  optional Locality locality = 4 [(gogoproto.nullable) = false];
  optional Version ServerVersion = 5 [(gogoproto.nullable) = false];
  optional string build_tag = 6 [(gogoproto.nullable) = false];
  // NOTE(review): the unit/epoch of started_at is not stated in this file;
  // confirm against the code that sets it.
  optional int64 started_at = 7 [(gogoproto.nullable) = false];
  repeated LocalityAddress locality_address = 8 [(gogoproto.nullable) = false];
  optional string cluster_name = 9 [(gogoproto.nullable) = false];

  // sql_address is the SQL address. When empty, the base address field is
  // also used to accept SQL connections.
  optional util.UnresolvedAddr sql_address = 10 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "SQLAddress"
  ];

  // http_address is the HTTP address. When empty, nodes are prevented from
  // proxying HTTP requests to each other.
  optional util.UnresolvedAddr http_address = 11 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "HTTPAddress"
  ];
}
// LocalityAddress carries the private address that is reachable only from
// other nodes in the corresponding locality.
message LocalityAddress {
  option (gogoproto.equal) = true;

  // address is the private address.
  optional util.UnresolvedAddr address = 1 [(gogoproto.nullable) = false];

  // locality_tier is the locality tier the address corresponds to.
  optional Tier locality_tier = 2 [(gogoproto.nullable) = false];
}
// StoreDescriptor bundles store information: the store's attributes, its
// node descriptor, and its capacity.
message StoreDescriptor {
  optional int32 store_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "StoreID",
    (gogoproto.casttype) = "StoreID"
  ];
  optional Attributes attrs = 2 [(gogoproto.nullable) = false];
  optional NodeDescriptor node = 3 [(gogoproto.nullable) = false];
  optional StoreCapacity capacity = 4 [(gogoproto.nullable) = false];
  optional StoreProperties properties = 5 [(gogoproto.nullable) = false];
}
// StoreDeadReplicas pairs a storeID with the list of dead replicas on that
// store. It is used to tell the range lease holder about corrupted or
// otherwise destroyed replicas that should be transferred to a different
// store.
message StoreDeadReplicas {
  optional int32 store_id = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "StoreID",
    (gogoproto.casttype) = "StoreID"
  ];

  // replicas lists the dead replicas on the store.
  repeated ReplicaIdent replicas = 2 [(gogoproto.nullable) = false];
}
// Locality is an ordered set of key/value Tiers describing where a node is
// located. The tier keys should be the same across all nodes.
message Locality {
  option (gogoproto.equal) = true;
  option (gogoproto.goproto_stringer) = false;

  // tiers holds the locality tiers, in order.
  repeated Tier tiers = 1 [(gogoproto.nullable) = false];
}
// Tier is a single level of the locality hierarchy.
message Tier {
  option (gogoproto.equal) = true;
  option (gogoproto.goproto_stringer) = false;

  // key is the name of the tier and should match all other nodes.
  optional string key = 1 [(gogoproto.nullable) = false];

  // value is the node-specific value corresponding to the key.
  optional string value = 2 [(gogoproto.nullable) = false];
}
// Version is a four-component version number: major, minor, patch, and an
// internal (development-cycle) component.
message Version {
  option (gogoproto.equal) = true;
  option (gogoproto.goproto_stringer) = false;

  // The names "major" and "minor" are reserved in C in some platforms (e.g.
  // FreeBSD), hence the "_val" suffix on the two field names below.
  optional int32 major_val = 1 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "Major"
  ];
  optional int32 minor_val = 2 [
    (gogoproto.nullable) = false,
    (gogoproto.customname) = "Minor"
  ];

  // Note that patch is a placeholder and will always be zero.
  optional int32 patch = 3 [(gogoproto.nullable) = false];

  // internal is used to introduce migrations during the development cycle.
  // Internal versions are subversions that are never the end version of a
  // release, i.e. users of stable, public releases will only use binaries
  // with the internal version set to 0.
  optional int32 internal = 4 [(gogoproto.nullable) = false];
}