Address review comments

vishesh · vishesh · commit 6a07470e3512 · 2025-11-10T15:29:50.000-08:00
diff --git a/fdbctl/protos/control_service.proto b/fdbctl/protos/control_service.proto
@@ -25,10 +25,8 @@ service ControlService {
     // encryption, and other critical cluster parameters.
     rpc Configure(ConfigureRequest) returns (ConfigureReply);
 
-    // Retrieves the current read version (transaction version) of the database.
-    rpc GetReadVersion(GetReadVersionRequest) returns (GetReadVersionReply);
-
     // Retrieves comprehensive cluster status including health, performance, and configuration.
+    // Docs: https://github.com/apple/foundationdb/blob/main/documentation/sphinx/source/mr-status.rst
     rpc GetStatus(GetStatusRequest) returns (GetStatusReply);
 
     // Retrieves the list of all worker processes in the cluster.
@@ -45,6 +43,15 @@ service ControlService {
 
     // Forcefully terminates worker processes. Use with caution - prefer Exclude for graceful removal.
     rpc Kill(KillRequest) returns (KillReply);
+
+    // Manages maintenance mode for zones. Maintenance mode prevents data distribution from moving
+    // data away from processes in the specified zone. A zone that is under maintenance will not
+    // have data moved away from it even if processes in that zone fail. In particular, this means
+    // the cluster will not attempt to heal the replication factor as a result of failures in the
+    // maintenance zone. This is useful when the amount of time that the processes in a fault domain
+    // are expected to be absent is reasonably short and you don't want to move data to and from the
+    // affected processes.
+    rpc Maintenance(MaintenanceRequest) returns (MaintenanceReply);
 }
 
 //------ Messages -------
@@ -57,7 +64,7 @@ message Worker {
     // These fields help FoundationDB make intelligent placement decisions for
     // data replication and fault tolerance.
     message Locality {
-        // Unique identifier for this process
+        // unique identifier for this process
         optional string process_id = 1;
 
         // Zone identifier - typically represents a failure domain (e.g., rack, availability zone)
@@ -93,7 +100,7 @@ message GetCoordinatorsRequest {}
 
 // Response containing the current cluster coordinators.
 message GetCoordinatorsReply {
-    // List of coordinator addresses in the format "ip:port"
+    // List of coordinator addresses in the format "ip:port" or "host:port"
     repeated string coordinators = 1;
 }
 
@@ -104,16 +111,13 @@ message ChangeCoordinatorsRequest {
     // Human-readable description for the cluster (e.g., cluster name)
     optional string cluster_description = 1;
 
-    // If true, disables the configuration database
-    optional bool disable_config_db = 2;
-
     // If true, automatically selects coordinators based on the current cluster topology.
     // When false, uses the addresses specified in new_coordinator_addresses.
-    optional bool automatic_coordinators = 3;
+    optional bool automatic_coordinators = 2;
 
     // List of addresses to use as new coordinators (when automatic_coordinators is false).
-    // Each address should be in the format "ip:port".
-    repeated string new_coordinator_addresses = 4;
+    // Each address should be in the format "ip:port" or "host:port".
+    repeated string new_coordinator_addresses = 3;
 }
 
 // Response to a coordinator change operation.
@@ -215,27 +219,23 @@ message ConfigureRequest {
     enum EncryptionAtRestMode {
         UNSET_ENCRYPTION = 0;     // No change to encryption mode
         DISABLED_ENCRYPTION = 1;  // Disable encryption at rest
-        DOMAIN_AWARE = 2;         // Domain-aware encryption (tenant-based)
-        CLUSTER_AWARE = 3;        // Cluster-wide encryption
+        CLUSTER_AWARE = 2;        // Cluster-wide encryption
     }
     // Encryption at rest mode for the database
     optional EncryptionAtRestMode encryption_at_rest_mode = 13;
 
     // Other database features
 
-    // If true, enables blob granules for storing large values
-    optional bool blob_granules_enabled = 14;
-
     // List of addresses to exclude during recruitment (format: "ip" or "ip:port")
-    repeated string exclude_addresses = 15;
+    repeated string exclude_addresses = 14;
 
     // Number of testing storage servers to maintain
-    optional int32 tss_count = 16;
+    optional int32 tss_count = 15;
 
     // Control flags
 
     // If true, skip safety checks (dangerous - use with caution)
-    optional bool force = 17;
+    optional bool force = 16;
 }
 
 // Response to a database configuration request.
@@ -274,17 +274,6 @@ message ConfigureReply {
     optional string message = 2;
 }
 
-// Request to get the current read version of the database.
-// The read version is a monotonically increasing transaction version number
-// used for snapshot isolation.
-message GetReadVersionRequest {}
-
-// Response containing the current read version.
-message GetReadVersionReply {
-    // Current read version (transaction version number)
-    optional int64 version = 1;
-}
-
 // Request to retrieve the cluster status.
 // This provides comprehensive information about the cluster's health and state.
 message GetStatusRequest {}
@@ -318,7 +307,7 @@ message IncludeRequest {
     // List of localities to include (format: "locality_key:locality_value")
     repeated string localities = 3;
 
-    // If true, include workers that were marked as failed
+    // If true, only include workers that were marked as failed
     optional bool failed = 4;
 }
 
@@ -335,7 +324,8 @@ message ExcludeRequest {
     // If true, exclude all workers (rarely used, requires force flag)
     optional bool all = 1;
 
-    // If true, mark workers as failed (more aggressive than normal exclude)
+    // If true, mark workers as failed. This flag will drop all the data for the
+    // specified workers and could cause data loss.
     optional bool failed = 2;
 
     // If true, don't wait for data migration to complete before returning
@@ -387,7 +377,6 @@ message ExcludeStatusReply {
 
 // Request to kill (terminate) worker processes.
 // This is a forceful operation that immediately stops processes.
-// Use with caution - prefer exclude for graceful removal.
 message KillRequest {
     // If true, kill all workers (requires extreme caution)
     optional bool all = 1;
@@ -398,3 +387,45 @@ message KillRequest {
 
 // Response to a kill operation.
 message KillReply {}
+
+// Request to inspect or change maintenance mode for zones.
+// Maintenance mode prevents data distribution from moving data away from the
+// specified zone, allowing safe maintenance operations (e.g., hardware upgrades).
+// Only one zone can be in maintenance mode at a time.
+message MaintenanceRequest {
+    // Kind of maintenance action being requested
+    enum Operation {
+        GET = 0;     // Get current maintenance status
+        SET = 1;     // Set maintenance mode for a zone
+        CLEAR = 2;   // Clear maintenance mode
+    }
+
+    // The operation to perform; reads as GET (the zero value) when left unset
+    optional Operation operation = 1;
+
+    // Zone ID to place in maintenance mode (required for SET operation)
+    optional string zone_id = 2;
+
+    // Duration in seconds for maintenance mode (required for SET operation)
+    optional double duration_seconds = 3;
+}
+
+// Response to a maintenance operation.
+message MaintenanceReply {
+    enum Result {  // NOTE(review): value 1 is unused; add "reserved 1;" if it was removed
+        SUCCESS = 0;             // Operation succeeded
+        INVALID_PARAMETERS = 2;  // Invalid parameters for the operation
+    }
+
+    // Outcome of the requested operation
+    optional Result result = 1;
+
+    // ID of the zone currently in maintenance mode (if any)
+    optional string zone_id = 2;
+
+    // Remaining seconds for the current maintenance (if active), as a whole number
+    optional int64 remaining_seconds = 3;
+
+    // Human-readable message with additional details
+    optional string message = 4;
+}