Define ControlService interface

vishesh · vishesh · commit a0af9bd652d5 · 2025-11-07T16:27:00.000-08:00
Adds RPCs and messages that provide administration and management
operations for an FDB cluster.
diff --git a/fdbctl/protos/control_service.proto b/fdbctl/protos/control_service.proto
@@ -2,5 +2,399 @@ syntax = "proto3";
 
 package fdbctl;
 
+option go_package = "github.com/apple/foundationdb/fdbctl";
+option java_package = "com.apple.foundationdb.fdbctl";
+
+//------ RPCs ------
+
+// ControlService provides administrative and management operations for a FoundationDB cluster.
+// It exposes coordinator management, database configuration, worker lifecycle control
+// (include, exclude, kill), and read-only queries for status, workers, and the read version.
 service ControlService {
+    // Retrieves the current list of coordinators for the cluster.
+    rpc GetCoordinators(GetCoordinatorsRequest) returns (GetCoordinatorsReply);
+
+    // Changes the cluster's coordinators, either automatically or to an explicit address list.
+    // This is a critical operation: coordinators maintain cluster configuration state.
+    rpc ChangeCoordinators(ChangeCoordinatorsRequest) returns (ChangeCoordinatorsReply);
+
+    // Returns a suggested configuration as a ConfigureRequest that can be passed to Configure.
+    rpc ConfigureAutoSuggest(ConfigureAutoSuggestRequest) returns (ConfigureAutoSuggestReply);
+
+    // Configures database settings including redundancy mode, storage engine, process counts,
+    // encryption, and other critical cluster parameters. The reply carries a Result code.
+    rpc Configure(ConfigureRequest) returns (ConfigureReply);
+
+    // Retrieves the current read version (transaction version) of the database.
+    rpc GetReadVersion(GetReadVersionRequest) returns (GetReadVersionReply);
+
+    // Retrieves cluster status (health, performance, configuration) as a JSON-formatted string.
+    rpc GetStatus(GetStatusRequest) returns (GetStatusReply);
+
+    // Retrieves the list of all worker processes in the cluster.
+    rpc GetWorkers(GetWorkersRequest) returns (GetWorkersReply);
+
+    // Re-enables previously excluded workers, allowing them to rejoin the cluster.
+    rpc Include(IncludeRequest) returns (IncludeReply);
+
+    // Excludes workers from the cluster, triggering graceful data migration away from them.
+    rpc Exclude(ExcludeRequest) returns (ExcludeReply);
+
+    // Retrieves the status of worker exclusions including in-progress migrations.
+    rpc ExcludeStatus(ExcludeStatusRequest) returns (ExcludeStatusReply);
+
+    // Forcefully terminates worker processes. Use with caution - prefer Exclude for graceful removal.
+    rpc Kill(KillRequest) returns (KillReply);
+}
+
+//------ Messages -------
+
+// Worker represents a FoundationDB process in the cluster.
+// A worker is described by its network address(es), process class, and locality.
+// Every field has explicit presence, so "unset" is distinguishable from an empty string.
+message Worker {
+    // Locality describes the physical or logical location of a worker process.
+    // These fields help FoundationDB make intelligent placement decisions for
+    // data replication and fault tolerance.
+    message Locality {
+        // Unique identifier for this process
+        optional string process_id = 1;
+
+        // Zone identifier - typically represents a failure domain (e.g., rack, availability zone)
+        optional string zone_id = 2;
+
+        // Machine identifier - identifies the physical or virtual machine
+        optional string machine_id = 3;
+
+        // Datacenter identifier - identifies which datacenter this worker is in
+        optional string dc_id = 4;
+
+        // Data hall identifier - identifies which data hall within a datacenter
+        optional string data_hall_id = 5;
+    }
+
+    // Network address where this worker can be reached (e.g., "127.0.0.1:4500")
+    optional string address = 1;
+
+    // gRPC address for this worker if gRPC is enabled
+    optional string grpc_address = 2;
+
+    // Process class determines the worker's role (e.g., "storage", "transaction", "stateless")
+    optional string process_class = 3;
+
+    // Locality information describing where this worker is physically/logically located
+    optional Locality locality = 4;
+}
+
+// Request for the GetCoordinators RPC. Takes no parameters; the reply lists
+// the processes that are currently acting as coordinators for the cluster's
+// coordination state.
+message GetCoordinatorsRequest {}
+
+// Response for the GetCoordinators RPC.
+message GetCoordinatorsReply {
+    // Addresses of the current coordinators, each in the format "ip:port"
+    repeated string coordinators = 1;
+}
+
+// Request for the ChangeCoordinators RPC.
+// Coordinators maintain the cluster's configuration and state. Changing them
+// is a critical operation that should be done carefully.
+message ChangeCoordinatorsRequest {
+    // Human-readable description for the cluster (e.g., cluster name)
+    optional string cluster_description = 1;
+
+    // If true, disables the configuration database
+    optional bool disable_config_db = 2;
+
+    // If true, automatically selects coordinators based on the current cluster topology.
+    // When false, uses the addresses specified in new_coordinator_addresses.
+    optional bool automatic_coordinators = 3;
+
+    // Explicit coordinator addresses, used when automatic_coordinators is false; each is "ip:port".
+    // NOTE(review): behavior when this is non-empty and automatic_coordinators is true is unspecified.
+    repeated string new_coordinator_addresses = 4;
+}
+
+// Response for the ChangeCoordinators RPC.
+message ChangeCoordinatorsReply {
+    // True if the coordinators were actually changed, false if they remained the same
+    optional bool changed = 1;
+
+    // Coordinator addresses in effect after the operation
+    repeated string coordinators = 2;
+}
+
+// Request for the ConfigureAutoSuggest RPC. Takes no parameters; the reply
+// carries suggested configuration parameters for the current cluster.
+message ConfigureAutoSuggestRequest {}
+
+// Response for the ConfigureAutoSuggest RPC.
+message ConfigureAutoSuggestReply {
+    // Suggested configuration, expressed as a ConfigureRequest usable with the Configure RPC
+    ConfigureRequest configure_request = 1;
+}
+
+// Request for the Configure RPC, which changes database settings.
+// This is one of the most critical operations for a FoundationDB cluster, affecting redundancy,
+// storage engines, process roles, and more. Each enum's zero value (UNSET_*) means "no change".
+message ConfigureRequest {
+    // Database creation flags
+
+    // If true, initialize a new database (only valid on first use of cluster)
+    optional bool new_database = 1;
+
+    // If true, enables testing storage server configuration
+    optional bool tss = 2;
+
+    // Redundancy mode determines how many copies of data are maintained
+    // and what failure scenarios the cluster can survive
+    enum RedundancyMode {
+        UNSET_REDUNDANCY = 0;     // No change to redundancy mode
+        SINGLE = 1;               // One copy, not fault tolerant (for testing only)
+        DOUBLE = 2;               // Two copies, survives one failure
+        TRIPLE = 3;               // Three copies, survives two failures
+        THREE_DATA_HALL = 4;      // Three data hall configuration for geographic redundancy
+        THREE_DATACENTER = 5;     // Three datacenter configuration for maximum availability
+    }
+    // Desired redundancy mode for the database
+    optional RedundancyMode redundancy_mode = 3;
+
+    // Storage engine determines the underlying storage technology and performance characteristics
+    enum StorageEngine {
+        UNSET_STORAGE = 0;        // No change to storage engine
+        SSD = 1;                  // B-Tree optimized for SSDs (default)
+        SSD_1 = 2;                // ssd-1 variant
+        SSD_2 = 3;                // ssd-2 variant. NOTE(review): upstream names Redwood "ssd-redwood-1"; ssd-2 looks like the B-tree engine - confirm
+        MEMORY = 4;               // In-memory storage (for testing or caching)
+        MEMORY_1 = 5;             // memory-1 variant
+        MEMORY_2 = 6;             // memory-2 variant
+        MEMORY_RADIXTREE = 7;     // memory-radixtree variant
+    }
+    // Desired storage engine for the database
+    optional StorageEngine storage_engine = 4;
+
+    // Process role counts (logs, commit_proxies, grv_proxies, resolvers) control how many
+    // processes are assigned to each role. -1 means restore the default for the current cluster size.
+
+    // Number of transaction log processes
+    optional int32 logs = 5;
+
+    // Number of commit proxy processes (handle transaction commits)
+    optional int32 commit_proxies = 6;
+
+    // Number of GRV (GetReadVersion) proxy processes (handle read version requests)
+    optional int32 grv_proxies = 7;
+
+    // Number of resolver processes (handle transaction conflicts)
+    optional int32 resolvers = 8;
+
+    // Perpetual storage wiggle settings control automatic storage server replacement
+    // to proactively detect hardware issues
+
+    // Enables automatic cycling of storage servers
+    optional bool perpetual_storage_wiggle = 9;
+
+    // Locality filter for wiggle: "locality_key:locality_value" or "0" to disable filtering
+    optional string perpetual_storage_wiggle_locality = 10;
+
+    // Storage engine type to wiggle (optional filter)
+    optional string perpetual_storage_wiggle_engine = 11;
+
+    // Storage migration controls how data moves between different storage types
+    enum StorageMigrationType {
+        UNSET_MIGRATION = 0;      // No change to migration type
+        DISABLED = 1;             // Disable storage migration
+        GRADUAL = 2;              // Gradually migrate data
+        AGGRESSIVE = 3;           // Aggressively migrate data
+    }
+    // How to handle storage migration when changing storage engines
+    optional StorageMigrationType storage_migration_type = 12;
+
+    // Encryption at rest settings control how data is encrypted on disk
+    enum EncryptionAtRestMode {
+        UNSET_ENCRYPTION = 0;     // No change to encryption mode
+        DISABLED_ENCRYPTION = 1;  // Disable encryption at rest
+        DOMAIN_AWARE = 2;         // Domain-aware encryption (tenant-based)
+        CLUSTER_AWARE = 3;        // Cluster-wide encryption
+    }
+    // Encryption at rest mode for the database
+    optional EncryptionAtRestMode encryption_at_rest_mode = 13;
+
+    // Other database features
+
+    // If true, enables blob granules for storing large values
+    optional bool blob_granules_enabled = 14;
+
+    // List of addresses to exclude during recruitment (format: "ip" or "ip:port")
+    repeated string exclude_addresses = 15;
+
+    // Number of testing storage servers to maintain
+    optional int32 tss_count = 16;
+
+    // Control flags
+
+    // If true, skip safety checks (dangerous - use with caution)
+    optional bool force = 17;
+}
+
+// Response for the Configure RPC.
+message ConfigureReply {
+    // Outcome of the configuration operation; SUCCESS_WARN_* values report success with a warning
+    enum Result {
+        SUCCESS = 0;                                  // Configuration succeeded
+        NO_OPTIONS_PROVIDED = 1;                      // No configuration options were provided
+        CONFLICTING_OPTIONS = 2;                      // Conflicting configuration options were specified
+        UNKNOWN_OPTION = 3;                           // An unknown configuration option was provided
+        INCOMPLETE_CONFIGURATION = 4;                 // Configuration is incomplete
+        INVALID_CONFIGURATION = 5;                    // Configuration is invalid
+        STORAGE_MIGRATION_DISABLED = 6;               // Storage migration is disabled
+        DATABASE_ALREADY_CREATED = 7;                 // Database has already been created
+        DATABASE_CREATED = 8;                         // Database was newly created
+        DATABASE_UNAVAILABLE = 9;                     // Database is unavailable
+        STORAGE_IN_UNKNOWN_DCID = 10;                 // Storage servers in unknown datacenter ID
+        REGION_NOT_FULLY_REPLICATED = 11;             // Region is not fully replicated
+        MULTIPLE_ACTIVE_REGIONS = 12;                 // Multiple active regions detected
+        REGIONS_CHANGED = 13;                         // Regions configuration changed
+        NOT_ENOUGH_WORKERS = 14;                      // Not enough workers to satisfy configuration
+        REGION_REPLICATION_MISMATCH = 15;             // Region replication mismatch
+        DCID_MISSING = 16;                            // Datacenter ID is missing
+        LOCKED_NOT_NEW = 17;                          // Database is locked but not new
+        SUCCESS_WARN_PPW_GRADUAL = 18;                // Success with warning about perpetual wiggle gradual mode
+        SUCCESS_WARN_SHARDED_ROCKSDB_EXPERIMENTAL = 19; // Success with warning about experimental sharded RocksDB
+        DATABASE_IS_REGISTERED = 20;                  // Database is already registered
+        ENCRYPTION_AT_REST_MODE_ALREADY_SET = 21;     // Encryption mode is already set
+        INVALID_STORAGE_TYPE = 22;                    // Invalid storage type specified
+    }
+
+    // Result code for the operation. NOTE(review): no explicit presence here, so an unset result reads as SUCCESS (0).
+    Result result = 1;
+
+    // Additional human-readable details for errors or warnings
+    optional string message = 2;
 }
+
+// Request for the GetReadVersion RPC. Takes no parameters.
+// The read version is a monotonically increasing transaction version number
+// used for snapshot isolation.
+message GetReadVersionRequest {}
+
+// Response for the GetReadVersion RPC.
+message GetReadVersionReply {
+    // Current read version of the database (transaction version number)
+    optional int64 version = 1;
+}
+
+// Request for the GetStatus RPC. Takes no parameters.
+// The reply provides comprehensive information about the cluster's health and state.
+message GetStatusRequest {}
+
+// Response for the GetStatus RPC.
+message GetStatusReply {
+    // Cluster status serialized as a JSON string: cluster health, performance metrics,
+    // configuration, processes, and workload information
+    optional string result = 1;
+}
+
+// Request for the GetWorkers RPC (all workers in the cluster). Takes no parameters.
+message GetWorkersRequest {}
+
+// Response for the GetWorkers RPC.
+message GetWorkersReply {
+    // All worker processes in the cluster, one Worker entry per process
+    repeated Worker workers = 1;
+}
+
+// Request for the Include RPC, which re-enables previously excluded workers.
+// This reverses the effect of an Exclude operation, allowing workers to
+// participate in the cluster again.
+message IncludeRequest {
+    // If true, include all previously excluded workers
+    optional bool all = 1;
+
+    // Specific addresses to include (format: "ip" or "ip:port")
+    repeated string addresses = 2;
+
+    // Localities to include (format: "locality_key:locality_value")
+    repeated string localities = 3;
+
+    // If true, include workers that were marked as failed
+    optional bool failed = 4;
+}
+
+// Response for the Include RPC.
+message IncludeReply {
+    // Number of workers included
+    optional int32 num_included = 1;
+}
+
+// Request for the Exclude RPC.
+// Excluding a worker prevents it from being assigned new data or roles,
+// and the cluster will migrate existing data away from it.
+message ExcludeRequest {
+    // If true, exclude all workers (rarely used, requires the force field to be set)
+    optional bool all = 1;
+
+    // If true, mark workers as failed (more aggressive than normal exclude)
+    optional bool failed = 2;
+
+    // If true, don't wait for data migration to complete before returning (see ExcludeReply.data_movement_complete)
+    optional bool no_wait = 3;
+
+    // If true, bypass safety checks (use with extreme caution)
+    optional bool force = 4;
+
+    // Localities to exclude (format: "locality_key:locality_value")
+    repeated string localities = 5;
+
+    // Process addresses to exclude (format: "ip:port")
+    repeated string processes = 6;
+
+    // Host addresses to exclude (format: "ip", excludes all ports on that host)
+    repeated string hosts = 7;
+}
+
+// Response for the Exclude RPC.
+message ExcludeReply {
+    // Number of workers excluded
+    optional int32 num_excluded = 1;
+
+    // True if data movement is complete. Presumably may be false when the request set no_wait - confirm.
+    optional bool data_movement_complete = 2;
+}
+
+// Request for the ExcludeStatus RPC (status of exclusions). Takes no parameters.
+message ExcludeStatusRequest {}
+
+// Response for the ExcludeStatus RPC, listing excluded, failed, and in-progress exclusions.
+// TODO: Use Locality structure instead of strings.
+message ExcludeStatusReply {
+    // Addresses that are currently excluded
+    repeated string excluded_addresses = 1;
+
+    // Localities that are currently excluded
+    repeated string excluded_localities = 2;
+
+    // Addresses that are marked as failed
+    repeated string failed_addresses = 3;
+
+    // Localities that are marked as failed
+    repeated string failed_localities = 4;
+
+    // Exclusions still in progress (data migration not yet complete). NOTE(review): entry format (address vs. locality) is unspecified.
+    repeated string in_progress_excludes = 5;
+}
+
+// Request for the Kill RPC, which terminates worker processes.
+// This is a forceful operation that immediately stops processes.
+// Use with caution - prefer the Exclude RPC for graceful removal.
+message KillRequest {
+    // If true, kill all workers (requires extreme caution)
+    optional bool all = 1;
+
+    // Specific worker addresses to kill (format: "ip:port")
+    repeated string addresses = 2;
+}
+
+// Response for the Kill RPC; currently carries no fields.
+message KillReply {}