Skip to content

Commit a0af9bd

Browse files
committed
Define ControlService interface
Adds RPCs and messages that provide administration and management operations for an FDB cluster.
1 parent 53fe3ec commit a0af9bd

File tree

1 file changed

+394
-0
lines changed

1 file changed

+394
-0
lines changed

fdbctl/protos/control_service.proto

Lines changed: 394 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,399 @@ syntax = "proto3";
22

33
package fdbctl;
44

5+
option go_package = "github.com/apple/foundationdb/fdbctl";
6+
option java_package = "com.apple.foundationdb.fdbctl";
7+
8+
//------ RPCs ------
9+
10+
// ControlService is the administrative and management API of a FoundationDB
// cluster. It covers cluster configuration, coordinator management, worker
// lifecycle, and monitoring of cluster health and status.
service ControlService {
  // Returns the cluster's current list of coordinators.
  rpc GetCoordinators(GetCoordinatorsRequest)
      returns (GetCoordinatorsReply);

  // Replaces the cluster's coordinators. Coordinators maintain cluster
  // configuration state, so this is a critical operation that must be
  // performed carefully.
  rpc ChangeCoordinators(ChangeCoordinatorsRequest)
      returns (ChangeCoordinatorsReply);

  // Proposes configuration parameters derived from the current cluster
  // topology.
  rpc ConfigureAutoSuggest(ConfigureAutoSuggestRequest)
      returns (ConfigureAutoSuggestReply);

  // Applies database settings: redundancy mode, storage engine, process
  // counts, encryption, and other critical cluster parameters.
  rpc Configure(ConfigureRequest)
      returns (ConfigureReply);

  // Returns the database's current read version (transaction version).
  rpc GetReadVersion(GetReadVersionRequest)
      returns (GetReadVersionReply);

  // Returns comprehensive cluster status: health, performance, and
  // configuration.
  rpc GetStatus(GetStatusRequest)
      returns (GetStatusReply);

  // Lists every worker process in the cluster.
  rpc GetWorkers(GetWorkersRequest)
      returns (GetWorkersReply);

  // Re-enables previously excluded workers so they can rejoin the cluster.
  rpc Include(IncludeRequest)
      returns (IncludeReply);

  // Excludes workers from the cluster, which triggers graceful migration of
  // data away from them.
  rpc Exclude(ExcludeRequest)
      returns (ExcludeReply);

  // Reports the state of worker exclusions, including migrations that are
  // still in progress.
  rpc ExcludeStatus(ExcludeStatusRequest)
      returns (ExcludeStatusReply);

  // Forcefully terminates worker processes. Use with caution; prefer Exclude
  // for graceful removal.
  rpc Kill(KillRequest)
      returns (KillReply);
}
49+
50+
//------ Messages -------
51+
52+
// Worker describes one FoundationDB process in the cluster. A worker may take
// on different roles (storage, transaction, etc.) and is identified by its
// network address together with its locality information.
message Worker {
  // Locality captures the physical or logical placement of a worker process.
  // FoundationDB uses these fields when deciding where to place data for
  // replication and fault tolerance.
  message Locality {
    // Unique identifier of the process.
    optional string process_id = 1;

    // Zone identifier; typically a failure domain such as a rack or an
    // availability zone.
    optional string zone_id = 2;

    // Identifier of the physical or virtual machine.
    optional string machine_id = 3;

    // Identifier of the datacenter hosting the worker.
    optional string dc_id = 4;

    // Identifier of the data hall, within a datacenter, hosting the worker.
    optional string data_hall_id = 5;
  }

  // Network address at which the worker is reachable, e.g. "127.0.0.1:4500".
  optional string address = 1;

  // gRPC address of the worker, when gRPC is enabled.
  optional string grpc_address = 2;

  // Process class, which determines the worker's role, e.g. "storage",
  // "transaction", or "stateless".
  optional string process_class = 3;

  // Where the worker is physically/logically located.
  optional Locality locality = 4;
}
88+
89+
// Request for the cluster's current coordinators, i.e. the processes that are
// presently acting as coordinators for the cluster's coordination state.
// Carries no parameters.
message GetCoordinatorsRequest {}
93+
94+
// Reply carrying the cluster's current coordinators.
message GetCoordinatorsReply {
  // Coordinator addresses, each formatted as "ip:port".
  repeated string coordinators = 1;
}
99+
100+
// Request to replace the cluster's coordinators. Coordinators maintain the
// cluster's configuration and state, so changing them is a critical operation
// that should be carried out carefully.
message ChangeCoordinatorsRequest {
  // Human-readable description of the cluster (e.g. the cluster name).
  optional string cluster_description = 1;

  // When true, the configuration database is disabled.
  optional bool disable_config_db = 2;

  // When true, coordinators are chosen automatically from the current cluster
  // topology. When false, the addresses in new_coordinator_addresses are used.
  optional bool automatic_coordinators = 3;

  // Addresses to install as the new coordinators; consulted when
  // automatic_coordinators is false. Each entry is formatted as "ip:port".
  repeated string new_coordinator_addresses = 4;
}
118+
119+
// Reply to a coordinator change operation.
message ChangeCoordinatorsReply {
  // True when the coordinators were actually changed; false when they stayed
  // the same.
  optional bool changed = 1;

  // Coordinator addresses in effect after the operation.
  repeated string coordinators = 2;
}
127+
128+
// Request for automatic configuration suggestions: the current cluster is
// analyzed and suggested configuration parameters are returned. Carries no
// parameters.
message ConfigureAutoSuggestRequest {}
131+
132+
// Reply carrying the suggested configuration parameters.
message ConfigureAutoSuggestReply {
  // The suggested configuration, in a form that can be passed to the
  // Configure RPC.
  ConfigureRequest configure_request = 1;
}
137+
138+
// Request to configure database settings. This is among the most critical
// operations on a FoundationDB cluster: it affects redundancy, storage
// engines, process roles, and various other settings.
message ConfigureRequest {
  // ---- Database creation flags ----

  // When true, initializes a new database (only valid on first use of the
  // cluster).
  optional bool new_database = 1;

  // When true, enables the testing storage server configuration.
  optional bool tss = 2;

  // RedundancyMode selects how many copies of the data are maintained and
  // which failure scenarios the cluster can survive.
  enum RedundancyMode {
    // Leave the redundancy mode unchanged.
    UNSET_REDUNDANCY = 0;
    // One copy; not fault tolerant (for testing only).
    SINGLE = 1;
    // Two copies; survives one failure.
    DOUBLE = 2;
    // Three copies; survives two failures.
    TRIPLE = 3;
    // Three data hall configuration for geographic redundancy.
    THREE_DATA_HALL = 4;
    // Three datacenter configuration for maximum availability.
    THREE_DATACENTER = 5;
  }

  // Redundancy mode wanted for the database.
  optional RedundancyMode redundancy_mode = 3;

  // StorageEngine selects the underlying storage technology and, with it, the
  // performance characteristics.
  enum StorageEngine {
    // Leave the storage engine unchanged.
    UNSET_STORAGE = 0;
    // B-Tree optimized for SSDs (default).
    SSD = 1;
    // ssd-1 variant.
    SSD_1 = 2;
    // ssd-2 variant.
    // NOTE(review): the previous comment described this as the newer Redwood
    // engine; upstream FoundationDB documentation appears to name Redwood
    // "ssd-redwood-1" rather than "ssd-2" - confirm which engine is meant.
    SSD_2 = 3;
    // In-memory storage (for testing or caching).
    MEMORY = 4;
    // memory-1 variant.
    MEMORY_1 = 5;
    // memory-2 variant.
    MEMORY_2 = 6;
    // memory-radixtree variant.
    MEMORY_RADIXTREE = 7;
  }

  // Storage engine wanted for the database.
  optional StorageEngine storage_engine = 4;

  // ---- Process role counts ----
  // These control how many processes are assigned to each role. A value of -1
  // means "restore the default for the current cluster size".

  // Number of transaction log processes.
  optional int32 logs = 5;

  // Number of commit proxy processes (they handle transaction commits).
  optional int32 commit_proxies = 6;

  // Number of GRV (GetReadVersion) proxy processes (they handle read version
  // requests).
  optional int32 grv_proxies = 7;

  // Number of resolver processes (they handle transaction conflicts).
  optional int32 resolvers = 8;

  // ---- Perpetual storage wiggle ----
  // These settings control automatic storage server replacement, used to
  // proactively detect hardware issues.

  // Turns on automatic cycling of storage servers.
  optional bool perpetual_storage_wiggle = 9;

  // Locality filter for the wiggle: "locality_key:locality_value", or "0" to
  // disable filtering.
  optional string perpetual_storage_wiggle_locality = 10;

  // Storage engine type to wiggle (optional filter).
  optional string perpetual_storage_wiggle_engine = 11;

  // StorageMigrationType controls how data moves between different storage
  // types.
  enum StorageMigrationType {
    // Leave the migration type unchanged.
    UNSET_MIGRATION = 0;
    // Disable storage migration.
    DISABLED = 1;
    // Migrate data gradually.
    GRADUAL = 2;
    // Migrate data aggressively.
    AGGRESSIVE = 3;
  }

  // How storage migration is handled when the storage engine changes.
  optional StorageMigrationType storage_migration_type = 12;

  // EncryptionAtRestMode controls how data is encrypted on disk.
  enum EncryptionAtRestMode {
    // Leave the encryption mode unchanged.
    UNSET_ENCRYPTION = 0;
    // Disable encryption at rest.
    DISABLED_ENCRYPTION = 1;
    // Domain-aware encryption (tenant-based).
    DOMAIN_AWARE = 2;
    // Cluster-wide encryption.
    CLUSTER_AWARE = 3;
  }

  // Encryption-at-rest mode for the database.
  optional EncryptionAtRestMode encryption_at_rest_mode = 13;

  // ---- Other database features ----

  // When true, enables blob granules for storing large values.
  optional bool blob_granules_enabled = 14;

  // Addresses to exclude during recruitment; each entry is "ip" or "ip:port".
  repeated string exclude_addresses = 15;

  // Number of testing storage servers to maintain.
  optional int32 tss_count = 16;

  // ---- Control flags ----

  // When true, safety checks are skipped (dangerous - use with caution).
  optional bool force = 17;
}
240+
241+
// Reply to a database configuration request.
message ConfigureReply {
  // Result enumerates the possible outcomes of a configuration operation.
  enum Result {
    // Configuration succeeded.
    SUCCESS = 0;
    // No configuration options were provided.
    NO_OPTIONS_PROVIDED = 1;
    // Conflicting configuration options were specified.
    CONFLICTING_OPTIONS = 2;
    // An unknown configuration option was provided.
    UNKNOWN_OPTION = 3;
    // Configuration is incomplete.
    INCOMPLETE_CONFIGURATION = 4;
    // Configuration is invalid.
    INVALID_CONFIGURATION = 5;
    // Storage migration is disabled.
    STORAGE_MIGRATION_DISABLED = 6;
    // Database has already been created.
    DATABASE_ALREADY_CREATED = 7;
    // Database was newly created.
    DATABASE_CREATED = 8;
    // Database is unavailable.
    DATABASE_UNAVAILABLE = 9;
    // Storage servers in unknown datacenter ID.
    STORAGE_IN_UNKNOWN_DCID = 10;
    // Region is not fully replicated.
    REGION_NOT_FULLY_REPLICATED = 11;
    // Multiple active regions detected.
    MULTIPLE_ACTIVE_REGIONS = 12;
    // Regions configuration changed.
    REGIONS_CHANGED = 13;
    // Not enough workers to satisfy the configuration.
    NOT_ENOUGH_WORKERS = 14;
    // Region replication mismatch.
    REGION_REPLICATION_MISMATCH = 15;
    // Datacenter ID is missing.
    DCID_MISSING = 16;
    // Database is locked but not new.
    LOCKED_NOT_NEW = 17;
    // Success, with a warning about perpetual wiggle gradual mode.
    SUCCESS_WARN_PPW_GRADUAL = 18;
    // Success, with a warning about experimental sharded RocksDB.
    SUCCESS_WARN_SHARDED_ROCKSDB_EXPERIMENTAL = 19;
    // Database is already registered.
    DATABASE_IS_REGISTERED = 20;
    // Encryption mode is already set.
    ENCRYPTION_AT_REST_MODE_ALREADY_SET = 21;
    // Invalid storage type specified.
    INVALID_STORAGE_TYPE = 22;
  }

  // Result code indicating success or the specific error/warning.
  //
  // Declared `optional` (explicit presence) so that a reply which never set
  // this field can be told apart from an explicit SUCCESS, which is the
  // enum's zero value, and to match the other singular scalar fields in this
  // file. The field number and wire encoding are unchanged.
  optional Result result = 1;

  // Additional human-readable details for errors or warnings.
  optional string message = 2;
}
276+
277+
// Request for the database's current read version. The read version is a
// monotonically increasing transaction version number used for snapshot
// isolation. Carries no parameters.
message GetReadVersionRequest {}
281+
282+
// Reply carrying the current read version.
message GetReadVersionReply {
  // The current read version (a transaction version number).
  optional int64 version = 1;
}
287+
288+
// Request for the cluster status, which gives comprehensive information about
// the cluster's health and state. Carries no parameters.
message GetStatusRequest {}
291+
292+
// Reply carrying the cluster status.
message GetStatusReply {
  // Status document formatted as JSON. It contains cluster health,
  // performance metrics, configuration, processes, and workload information.
  optional string result = 1;
}
298+
299+
// Request for the list of all workers in the cluster. Carries no parameters.
message GetWorkersRequest {}
301+
302+
// Reply carrying the list of all workers.
message GetWorkersReply {
  // Every worker process in the cluster.
  repeated Worker workers = 1;
}
307+
308+
// Request to include (re-enable) workers that were previously excluded. It
// undoes the effect of an Exclude operation so the workers can take part in
// the cluster again.
message IncludeRequest {
  // When true, every previously excluded worker is included.
  optional bool all = 1;

  // Specific addresses to include; each entry is "ip" or "ip:port".
  repeated string addresses = 2;

  // Localities to include; each entry is "locality_key:locality_value".
  repeated string localities = 3;

  // When true, workers that were marked as failed are included.
  optional bool failed = 4;
}
324+
325+
// Reply to an include operation.
message IncludeReply {
  // How many workers were included.
  optional int32 num_included = 1;
}
330+
331+
// Request to exclude workers from the cluster. An excluded worker is no
// longer assigned new data or roles, and the cluster migrates its existing
// data elsewhere.
message ExcludeRequest {
  // When true, all workers are excluded (rarely used; requires the force
  // flag).
  optional bool all = 1;

  // When true, the workers are marked as failed (more aggressive than a
  // normal exclude).
  optional bool failed = 2;

  // When true, the call returns without waiting for data migration to
  // complete.
  optional bool no_wait = 3;

  // When true, safety checks are bypassed (use with extreme caution).
  optional bool force = 4;

  // Localities to exclude; each entry is "locality_key:locality_value".
  repeated string localities = 5;

  // Process addresses to exclude; each entry is "ip:port".
  repeated string processes = 6;

  // Host addresses to exclude; each entry is "ip" and excludes all ports on
  // that host.
  repeated string hosts = 7;
}
356+
357+
// Reply to an exclude operation.
message ExcludeReply {
  // How many workers were excluded.
  optional int32 num_excluded = 1;

  // True once data movement is complete.
  optional bool data_movement_complete = 2;
}
365+
366+
// Request for the status of exclusions. Carries no parameters.
message ExcludeStatusRequest {}
368+
369+
// Reply carrying the status of all exclusions.
// TODO: Use Locality structure instead of strings.
message ExcludeStatusReply {
  // Addresses currently excluded.
  repeated string excluded_addresses = 1;

  // Localities currently excluded.
  repeated string excluded_localities = 2;

  // Addresses marked as failed.
  repeated string failed_addresses = 3;

  // Localities marked as failed.
  repeated string failed_localities = 4;

  // Exclusions still in progress, i.e. whose data migration has not yet
  // completed.
  repeated string in_progress_excludes = 5;
}
387+
388+
// Request to kill (terminate) worker processes. This is a forceful operation
// that stops processes immediately. Use with caution; prefer exclude for
// graceful removal.
message KillRequest {
  // When true, all workers are killed (requires extreme caution).
  optional bool all = 1;

  // Specific worker addresses to kill; each entry is "ip:port".
  repeated string addresses = 2;
}
398+
399+
// Reply to a kill operation. Currently carries no fields.
message KillReply {}

0 commit comments

Comments
 (0)