@@ -2,5 +2,399 @@ syntax = "proto3";
22
// All definitions in this file live in the "fdbctl" proto package.
package fdbctl;

// Go import path for the generated code.
option go_package = "github.com/apple/foundationdb/fdbctl";
// Java package for the generated classes.
option java_package = "com.apple.foundationdb.fdbctl";
7+
8+ //------ RPCs ------
9+
// ControlService is the administrative and management surface of a
// FoundationDB cluster. Its RPCs cover cluster configuration, coordinator
// management, worker lifecycle (include / exclude / kill), and monitoring of
// cluster health and status.
service ControlService {
  // Returns the cluster's current list of coordinators.
  rpc GetCoordinators(GetCoordinatorsRequest) returns (GetCoordinatorsReply);

  // Replaces the cluster's coordinators. Coordinators maintain cluster
  // configuration state, so this is a critical operation that should be
  // performed carefully.
  rpc ChangeCoordinators(ChangeCoordinatorsRequest)
      returns (ChangeCoordinatorsReply);

  // Proposes configuration parameters derived from the current cluster
  // topology.
  rpc ConfigureAutoSuggest(ConfigureAutoSuggestRequest)
      returns (ConfigureAutoSuggestReply);

  // Applies database settings: redundancy mode, storage engine, process
  // counts, encryption, and other critical cluster parameters.
  rpc Configure(ConfigureRequest) returns (ConfigureReply);

  // Returns the database's current read version (transaction version).
  rpc GetReadVersion(GetReadVersionRequest) returns (GetReadVersionReply);

  // Returns comprehensive cluster status: health, performance, and
  // configuration.
  rpc GetStatus(GetStatusRequest) returns (GetStatusReply);

  // Lists every worker process in the cluster.
  rpc GetWorkers(GetWorkersRequest) returns (GetWorkersReply);

  // Re-enables previously excluded workers so they can rejoin the cluster.
  rpc Include(IncludeRequest) returns (IncludeReply);

  // Excludes workers from the cluster, which triggers graceful migration of
  // data away from them.
  rpc Exclude(ExcludeRequest) returns (ExcludeReply);

  // Reports the state of worker exclusions, including migrations that are
  // still in progress.
  rpc ExcludeStatus(ExcludeStatusRequest) returns (ExcludeStatusReply);

  // Forcefully terminates worker processes. Use with caution; prefer Exclude
  // for graceful removal.
  rpc Kill(KillRequest) returns (KillReply);
}
49+
50+ //------ Messages -------
51+
// Worker describes one FoundationDB process in the cluster. A worker may
// serve different roles (storage, transaction, etc.) and is identified by
// its network address together with its locality.
message Worker {
  // Locality records where a worker process sits, physically or logically.
  // FoundationDB uses these fields when deciding data placement for
  // replication and fault tolerance.
  message Locality {
    // Unique identifier of the process.
    optional string process_id = 1;

    // Zone identifier; typically a failure domain such as a rack or an
    // availability zone.
    optional string zone_id = 2;

    // Identifier of the physical or virtual machine.
    optional string machine_id = 3;

    // Identifier of the datacenter hosting the worker.
    optional string dc_id = 4;

    // Identifier of the data hall inside the datacenter.
    optional string data_hall_id = 5;
  }

  // Network address at which the worker is reachable, e.g. "127.0.0.1:4500".
  optional string address = 1;

  // The worker's gRPC address, when gRPC is enabled.
  optional string grpc_address = 2;

  // Process class, which determines the worker's role (e.g. "storage",
  // "transaction", "stateless").
  optional string process_class = 3;

  // Physical / logical location of the worker.
  optional Locality locality = 4;
}
88+
// GetCoordinatorsRequest asks for the processes currently acting as
// coordinators for the cluster's coordination state. It carries no
// parameters.
message GetCoordinatorsRequest {}
93+
// GetCoordinatorsReply carries the cluster's current coordinators.
message GetCoordinatorsReply {
  // Coordinator addresses, each formatted as "ip:port".
  repeated string coordinators = 1;
}
99+
// ChangeCoordinatorsRequest asks the cluster to switch to a new coordinator
// set. Coordinators maintain the cluster's configuration and state, so this
// is a critical operation that should be done carefully.
message ChangeCoordinatorsRequest {
  // Human-readable description of the cluster (e.g. the cluster name).
  optional string cluster_description = 1;

  // When true, the configuration database is disabled.
  optional bool disable_config_db = 2;

  // When true, coordinators are chosen automatically from the current
  // cluster topology. When false, new_coordinator_addresses is used instead.
  optional bool automatic_coordinators = 3;

  // Addresses to install as coordinators when automatic_coordinators is
  // false. Each entry is formatted as "ip:port".
  repeated string new_coordinator_addresses = 4;
}
118+
// ChangeCoordinatorsReply reports the outcome of a coordinator change.
message ChangeCoordinatorsReply {
  // True when the coordinator set actually changed; false when it stayed
  // the same.
  optional bool changed = 1;

  // Coordinator addresses in effect after the operation.
  repeated string coordinators = 2;
}
127+
// ConfigureAutoSuggestRequest asks the service to analyze the current
// cluster and return suggested configuration parameters. It carries no
// parameters.
message ConfigureAutoSuggestRequest {}
131+
// ConfigureAutoSuggestReply carries the suggested configuration parameters.
message ConfigureAutoSuggestReply {
  // Suggested settings, expressed as a ConfigureRequest so that they can be
  // passed to the Configure RPC.
  // NOTE(review): this embeds a request message inside a reply, so any field
  // later added to ConfigureRequest also becomes part of this reply's schema.
  ConfigureRequest configure_request = 1;
}
137+
// ConfigureRequest describes a change to database settings. It is one of the
// most critical operations for a FoundationDB cluster: it affects redundancy,
// storage engines, process roles, and various other settings.
//
// Each enum declared here has a zero value named UNSET_* whose meaning is
// "no change to this setting".
message ConfigureRequest {
  // ---- Database creation flags ----

  // When true, initialize a new database (only valid on first use of the
  // cluster).
  optional bool new_database = 1;

  // When true, enables testing storage server configuration.
  optional bool tss = 2;

  // RedundancyMode determines how many copies of the data are maintained and
  // which failure scenarios the cluster can survive.
  enum RedundancyMode {
    // No change to redundancy mode.
    UNSET_REDUNDANCY = 0;
    // One copy; not fault tolerant (for testing only).
    SINGLE = 1;
    // Two copies; survives one failure.
    DOUBLE = 2;
    // Three copies; survives two failures.
    TRIPLE = 3;
    // Three data hall configuration for geographic redundancy.
    THREE_DATA_HALL = 4;
    // Three datacenter configuration for maximum availability.
    THREE_DATACENTER = 5;
  }

  // Desired redundancy mode for the database.
  optional RedundancyMode redundancy_mode = 3;

  // StorageEngine determines the underlying storage technology and its
  // performance characteristics.
  enum StorageEngine {
    // No change to storage engine.
    UNSET_STORAGE = 0;
    // B-Tree engine optimized for SSDs (default).
    SSD = 1;
    // The "ssd-1" variant.
    SSD_1 = 2;
    // The "ssd-2" variant.
    // NOTE(review): upstream FoundationDB documentation names the Redwood
    // engine "ssd-redwood-1" and describes "ssd-2" as the SQLite-based
    // B-tree engine, so this value should not be read as Redwood. Confirm
    // against the server-side mapping of this enum.
    SSD_2 = 3;
    // In-memory storage (for testing or caching).
    MEMORY = 4;
    // The "memory-1" variant.
    MEMORY_1 = 5;
    // The "memory-2" variant.
    MEMORY_2 = 6;
    // The "memory-radixtree" variant.
    MEMORY_RADIXTREE = 7;
  }

  // Desired storage engine for the database.
  optional StorageEngine storage_engine = 4;

  // ---- Process role counts ----
  // These control how many processes are assigned to each role. For every
  // count below, -1 means "restore the default value for the current cluster
  // size".

  // Number of transaction log processes (-1 restores the default).
  optional int32 logs = 5;

  // Number of commit proxy processes, which handle transaction commits
  // (-1 restores the default).
  optional int32 commit_proxies = 6;

  // Number of GRV (GetReadVersion) proxy processes, which handle read
  // version requests (-1 restores the default).
  optional int32 grv_proxies = 7;

  // Number of resolver processes, which handle transaction conflicts
  // (-1 restores the default).
  optional int32 resolvers = 8;

  // ---- Perpetual storage wiggle ----
  // These settings control automatic storage server replacement, used to
  // proactively detect hardware issues.

  // Enables automatic cycling of storage servers.
  optional bool perpetual_storage_wiggle = 9;

  // Locality filter for the wiggle, formatted as
  // "locality_key:locality_value"; "0" disables filtering.
  optional string perpetual_storage_wiggle_locality = 10;

  // Storage engine type to wiggle (optional filter).
  optional string perpetual_storage_wiggle_engine = 11;

  // StorageMigrationType controls how data moves between different storage
  // types.
  enum StorageMigrationType {
    // No change to migration type.
    UNSET_MIGRATION = 0;
    // Disable storage migration.
    DISABLED = 1;
    // Gradually migrate data.
    GRADUAL = 2;
    // Aggressively migrate data.
    AGGRESSIVE = 3;
  }

  // How to handle storage migration when changing storage engines.
  optional StorageMigrationType storage_migration_type = 12;

  // EncryptionAtRestMode controls how data is encrypted on disk.
  enum EncryptionAtRestMode {
    // No change to encryption mode.
    UNSET_ENCRYPTION = 0;
    // Disable encryption at rest.
    DISABLED_ENCRYPTION = 1;
    // Domain-aware encryption (tenant-based).
    DOMAIN_AWARE = 2;
    // Cluster-wide encryption.
    CLUSTER_AWARE = 3;
  }

  // Encryption at rest mode for the database.
  optional EncryptionAtRestMode encryption_at_rest_mode = 13;

  // ---- Other database features ----

  // When true, enables blob granules.
  // NOTE(review): what blob granules are used for is not established by this
  // file; consult the upstream blob granule documentation before describing
  // their purpose here.
  optional bool blob_granules_enabled = 14;

  // Addresses to exclude during recruitment, each formatted as "ip" or
  // "ip:port".
  repeated string exclude_addresses = 15;

  // Number of testing storage servers to maintain.
  optional int32 tss_count = 16;

  // ---- Control flags ----

  // When true, skip safety checks (dangerous; use with caution).
  optional bool force = 17;
}
240+
// ConfigureReply reports the outcome of a database configuration request.
message ConfigureReply {
  // Result enumerates the possible outcomes of a configuration operation.
  enum Result {
    // Configuration succeeded.
    SUCCESS = 0;
    // No configuration options were provided.
    NO_OPTIONS_PROVIDED = 1;
    // Conflicting configuration options were specified.
    CONFLICTING_OPTIONS = 2;
    // An unknown configuration option was provided.
    UNKNOWN_OPTION = 3;
    // Configuration is incomplete.
    INCOMPLETE_CONFIGURATION = 4;
    // Configuration is invalid.
    INVALID_CONFIGURATION = 5;
    // Storage migration is disabled.
    STORAGE_MIGRATION_DISABLED = 6;
    // Database has already been created.
    DATABASE_ALREADY_CREATED = 7;
    // Database was newly created.
    DATABASE_CREATED = 8;
    // Database is unavailable.
    DATABASE_UNAVAILABLE = 9;
    // Storage servers in unknown datacenter ID.
    STORAGE_IN_UNKNOWN_DCID = 10;
    // Region is not fully replicated.
    REGION_NOT_FULLY_REPLICATED = 11;
    // Multiple active regions detected.
    MULTIPLE_ACTIVE_REGIONS = 12;
    // Regions configuration changed.
    REGIONS_CHANGED = 13;
    // Not enough workers to satisfy configuration.
    NOT_ENOUGH_WORKERS = 14;
    // Region replication mismatch.
    REGION_REPLICATION_MISMATCH = 15;
    // Datacenter ID is missing.
    DCID_MISSING = 16;
    // Database is locked but not new.
    LOCKED_NOT_NEW = 17;
    // Success, with a warning about perpetual wiggle gradual mode.
    SUCCESS_WARN_PPW_GRADUAL = 18;
    // Success, with a warning about experimental sharded RocksDB.
    SUCCESS_WARN_SHARDED_ROCKSDB_EXPERIMENTAL = 19;
    // Database is already registered.
    DATABASE_IS_REGISTERED = 20;
    // Encryption mode is already set.
    ENCRYPTION_AT_REST_MODE_ALREADY_SET = 21;
    // Invalid storage type specified.
    INVALID_STORAGE_TYPE = 22;
  }

  // Outcome code: success, or a specific error / warning.
  // NOTE(review): this field is declared without `optional` and SUCCESS is
  // the zero value, so a reply whose result was never populated is
  // indistinguishable from an explicit SUCCESS. Servers must always set it.
  Result result = 1;

  // Additional human-readable details for errors or warnings.
  optional string message = 2;
}
276+
// GetReadVersionRequest asks for the database's current read version: a
// monotonically increasing transaction version number used for snapshot
// isolation. It carries no parameters.
message GetReadVersionRequest {}
281+
// GetReadVersionReply carries the database's current read version.
message GetReadVersionReply {
  // The current read version (transaction version number).
  optional int64 version = 1;
}
287+
// GetStatusRequest asks for the cluster status, i.e. comprehensive
// information about the cluster's health and state. It carries no
// parameters.
message GetStatusRequest {}
291+
// GetStatusReply carries the cluster status.
message GetStatusReply {
  // Status document as a JSON-formatted string. It covers cluster health,
  // performance metrics, configuration, processes, and workload information.
  optional string result = 1;
}
298+
// GetWorkersRequest asks for all workers in the cluster. It carries no
// parameters.
message GetWorkersRequest {}
301+
// GetWorkersReply carries the list of all workers.
message GetWorkersReply {
  // Every worker process in the cluster.
  repeated Worker workers = 1;
}
307+
// IncludeRequest re-enables previously excluded workers. It reverses the
// effect of an Exclude operation so that the workers can participate in the
// cluster again.
message IncludeRequest {
  // When true, include all previously excluded workers.
  optional bool all = 1;

  // Specific addresses to include, each formatted as "ip" or "ip:port".
  repeated string addresses = 2;

  // Localities to include, each formatted as "locality_key:locality_value".
  repeated string localities = 3;

  // When true, include workers that were marked as failed.
  optional bool failed = 4;
}
324+
// IncludeReply reports the outcome of an include operation.
message IncludeReply {
  // How many workers were included.
  optional int32 num_included = 1;
}
330+
// ExcludeRequest removes workers from active duty. An excluded worker is no
// longer assigned new data or roles, and the cluster migrates its existing
// data elsewhere.
message ExcludeRequest {
  // When true, exclude all workers (rarely used; requires the force flag).
  optional bool all = 1;

  // When true, mark the workers as failed (more aggressive than a normal
  // exclude).
  optional bool failed = 2;

  // When true, return without waiting for data migration to complete.
  optional bool no_wait = 3;

  // When true, bypass safety checks (use with extreme caution).
  optional bool force = 4;

  // Localities to exclude, each formatted as "locality_key:locality_value".
  repeated string localities = 5;

  // Process addresses to exclude, each formatted as "ip:port".
  repeated string processes = 6;

  // Host addresses to exclude, each formatted as "ip"; this excludes all
  // ports on that host.
  repeated string hosts = 7;
}
356+
// ExcludeReply reports the outcome of an exclude operation.
message ExcludeReply {
  // How many workers were excluded.
  optional int32 num_excluded = 1;

  // True when data movement is complete.
  optional bool data_movement_complete = 2;
}
365+
// ExcludeStatusRequest asks for the status of exclusions. It carries no
// parameters.
message ExcludeStatusRequest {}
368+
// ExcludeStatusReply carries the status of all exclusions.
// TODO: Use Locality structure instead of strings.
message ExcludeStatusReply {
  // Addresses currently excluded.
  repeated string excluded_addresses = 1;

  // Localities currently excluded.
  repeated string excluded_localities = 2;

  // Addresses marked as failed.
  repeated string failed_addresses = 3;

  // Localities marked as failed.
  repeated string failed_localities = 4;

  // Exclusions still in progress, i.e. whose data migration has not yet
  // completed.
  repeated string in_progress_excludes = 5;
}
387+
// KillRequest terminates worker processes. This is a forceful operation that
// stops processes immediately; use with caution and prefer exclude for
// graceful removal.
message KillRequest {
  // When true, kill all workers (requires extreme caution).
  optional bool all = 1;

  // Specific worker addresses to kill, each formatted as "ip:port".
  repeated string addresses = 2;
}
398+
// KillReply is the (currently empty) response to a kill operation.
message KillReply {}
0 commit comments