// worker.proto
  1. syntax = "proto3";
  2. package worker_pb;
  3. option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/worker_pb";
  4. // WorkerService provides bidirectional communication between admin and worker
  5. service WorkerService {
  6. // WorkerStream maintains a bidirectional stream for worker communication
  7. rpc WorkerStream(stream WorkerMessage) returns (stream AdminMessage);
  8. }
  9. // WorkerMessage represents messages from worker to admin
  10. message WorkerMessage {
  11. string worker_id = 1;
  12. int64 timestamp = 2;
  13. oneof message {
  14. WorkerRegistration registration = 3;
  15. WorkerHeartbeat heartbeat = 4;
  16. TaskRequest task_request = 5;
  17. TaskUpdate task_update = 6;
  18. TaskComplete task_complete = 7;
  19. WorkerShutdown shutdown = 8;
  20. TaskLogResponse task_log_response = 9;
  21. }
  22. }
  23. // AdminMessage represents messages from admin to worker
  24. message AdminMessage {
  25. string admin_id = 1;
  26. int64 timestamp = 2;
  27. oneof message {
  28. RegistrationResponse registration_response = 3;
  29. HeartbeatResponse heartbeat_response = 4;
  30. TaskAssignment task_assignment = 5;
  31. TaskCancellation task_cancellation = 6;
  32. AdminShutdown admin_shutdown = 7;
  33. TaskLogRequest task_log_request = 8;
  34. }
  35. }
  36. // WorkerRegistration message when worker connects
  37. message WorkerRegistration {
  38. string worker_id = 1;
  39. string address = 2;
  40. repeated string capabilities = 3;
  41. int32 max_concurrent = 4;
  42. map<string, string> metadata = 5;
  43. }
  44. // RegistrationResponse confirms worker registration
  45. message RegistrationResponse {
  46. bool success = 1;
  47. string message = 2;
  48. string assigned_worker_id = 3;
  49. }
  50. // WorkerHeartbeat sent periodically by worker
  51. message WorkerHeartbeat {
  52. string worker_id = 1;
  53. string status = 2;
  54. int32 current_load = 3;
  55. int32 max_concurrent = 4;
  56. repeated string current_task_ids = 5;
  57. int32 tasks_completed = 6;
  58. int32 tasks_failed = 7;
  59. int64 uptime_seconds = 8;
  60. }
  61. // HeartbeatResponse acknowledges heartbeat
  62. message HeartbeatResponse {
  63. bool success = 1;
  64. string message = 2;
  65. }
  66. // TaskRequest from worker asking for new tasks
  67. message TaskRequest {
  68. string worker_id = 1;
  69. repeated string capabilities = 2;
  70. int32 available_slots = 3;
  71. }
  72. // TaskAssignment from admin to worker
  73. message TaskAssignment {
  74. string task_id = 1;
  75. string task_type = 2;
  76. TaskParams params = 3;
  77. int32 priority = 4;
  78. int64 created_time = 5;
  79. map<string, string> metadata = 6;
  80. }
  81. // TaskParams contains task-specific parameters with typed variants
  82. message TaskParams {
  83. string task_id = 1; // ActiveTopology task ID for lifecycle management
  84. uint32 volume_id = 2; // Primary volume ID for the task
  85. string collection = 3; // Collection name
  86. string data_center = 4; // Primary data center
  87. string rack = 5; // Primary rack
  88. uint64 volume_size = 6; // Original volume size in bytes for tracking size changes
  89. // Unified source and target arrays for all task types
  90. repeated TaskSource sources = 7; // Source locations (volume replicas, EC shards, etc.)
  91. repeated TaskTarget targets = 8; // Target locations (destinations, new replicas, etc.)
  92. // Typed task parameters
  93. oneof task_params {
  94. VacuumTaskParams vacuum_params = 9;
  95. ErasureCodingTaskParams erasure_coding_params = 10;
  96. BalanceTaskParams balance_params = 11;
  97. ReplicationTaskParams replication_params = 12;
  98. }
  99. }
  100. // VacuumTaskParams for vacuum operations
  101. message VacuumTaskParams {
  102. double garbage_threshold = 1; // Minimum garbage ratio to trigger vacuum
  103. bool force_vacuum = 2; // Force vacuum even if below threshold
  104. int32 batch_size = 3; // Number of files to process per batch
  105. string working_dir = 4; // Working directory for temporary files
  106. bool verify_checksum = 5; // Verify file checksums during vacuum
  107. }
  108. // ErasureCodingTaskParams for EC encoding operations
  109. message ErasureCodingTaskParams {
  110. uint64 estimated_shard_size = 1; // Estimated size per shard
  111. int32 data_shards = 2; // Number of data shards (default: 10)
  112. int32 parity_shards = 3; // Number of parity shards (default: 4)
  113. string working_dir = 4; // Working directory for EC processing
  114. string master_client = 5; // Master server address
  115. bool cleanup_source = 6; // Whether to cleanup source volume after EC
  116. }
  117. // TaskSource represents a unified source location for any task type
  118. message TaskSource {
  119. string node = 1; // Source server address
  120. uint32 disk_id = 2; // Source disk ID
  121. string rack = 3; // Source rack for tracking
  122. string data_center = 4; // Source data center for tracking
  123. uint32 volume_id = 5; // Volume ID (for volume operations)
  124. repeated uint32 shard_ids = 6; // Shard IDs (for EC shard operations)
  125. uint64 estimated_size = 7; // Estimated size to be processed
  126. }
  127. // TaskTarget represents a unified target location for any task type
  128. message TaskTarget {
  129. string node = 1; // Target server address
  130. uint32 disk_id = 2; // Target disk ID
  131. string rack = 3; // Target rack for tracking
  132. string data_center = 4; // Target data center for tracking
  133. uint32 volume_id = 5; // Volume ID (for volume operations)
  134. repeated uint32 shard_ids = 6; // Shard IDs (for EC shard operations)
  135. uint64 estimated_size = 7; // Estimated size to be created
  136. }
  137. // BalanceTaskParams for volume balancing operations
  138. message BalanceTaskParams {
  139. bool force_move = 1; // Force move even with conflicts
  140. int32 timeout_seconds = 2; // Operation timeout
  141. }
  142. // ReplicationTaskParams for adding replicas
  143. message ReplicationTaskParams {
  144. int32 replica_count = 1; // Target replica count
  145. bool verify_consistency = 2; // Verify replica consistency after creation
  146. }
  147. // TaskUpdate reports task progress
  148. message TaskUpdate {
  149. string task_id = 1;
  150. string worker_id = 2;
  151. string status = 3;
  152. float progress = 4;
  153. string message = 5;
  154. map<string, string> metadata = 6;
  155. }
  156. // TaskComplete reports task completion
  157. message TaskComplete {
  158. string task_id = 1;
  159. string worker_id = 2;
  160. bool success = 3;
  161. string error_message = 4;
  162. int64 completion_time = 5;
  163. map<string, string> result_metadata = 6;
  164. }
  165. // TaskCancellation from admin to cancel a task
  166. message TaskCancellation {
  167. string task_id = 1;
  168. string reason = 2;
  169. bool force = 3;
  170. }
  171. // WorkerShutdown notifies admin that worker is shutting down
  172. message WorkerShutdown {
  173. string worker_id = 1;
  174. string reason = 2;
  175. repeated string pending_task_ids = 3;
  176. }
  177. // AdminShutdown notifies worker that admin is shutting down
  178. message AdminShutdown {
  179. string reason = 1;
  180. int32 graceful_shutdown_seconds = 2;
  181. }
  182. // ========== Task Log Messages ==========
  183. // TaskLogRequest requests logs for a specific task
  184. message TaskLogRequest {
  185. string task_id = 1;
  186. string worker_id = 2;
  187. bool include_metadata = 3; // Include task metadata
  188. int32 max_entries = 4; // Maximum number of log entries (0 = all)
  189. string log_level = 5; // Filter by log level (INFO, WARNING, ERROR, DEBUG)
  190. int64 start_time = 6; // Unix timestamp for start time filter
  191. int64 end_time = 7; // Unix timestamp for end time filter
  192. }
  193. // TaskLogResponse returns task logs and metadata
  194. message TaskLogResponse {
  195. string task_id = 1;
  196. string worker_id = 2;
  197. bool success = 3;
  198. string error_message = 4;
  199. TaskLogMetadata metadata = 5;
  200. repeated TaskLogEntry log_entries = 6;
  201. }
  202. // TaskLogMetadata contains metadata about task execution
  203. message TaskLogMetadata {
  204. string task_id = 1;
  205. string task_type = 2;
  206. string worker_id = 3;
  207. int64 start_time = 4;
  208. int64 end_time = 5;
  209. int64 duration_ms = 6;
  210. string status = 7;
  211. float progress = 8;
  212. uint32 volume_id = 9;
  213. string server = 10;
  214. string collection = 11;
  215. string log_file_path = 12;
  216. int64 created_at = 13;
  217. map<string, string> custom_data = 14;
  218. }
  219. // TaskLogEntry represents a single log entry
  220. message TaskLogEntry {
  221. int64 timestamp = 1;
  222. string level = 2;
  223. string message = 3;
  224. map<string, string> fields = 4;
  225. float progress = 5;
  226. string status = 6;
  227. }
  228. // ========== Maintenance Configuration Messages ==========
  229. // MaintenanceConfig holds configuration for the maintenance system
  230. message MaintenanceConfig {
  231. bool enabled = 1;
  232. int32 scan_interval_seconds = 2; // How often to scan for maintenance needs
  233. int32 worker_timeout_seconds = 3; // Worker heartbeat timeout
  234. int32 task_timeout_seconds = 4; // Individual task timeout
  235. int32 retry_delay_seconds = 5; // Delay between retries
  236. int32 max_retries = 6; // Default max retries for tasks
  237. int32 cleanup_interval_seconds = 7; // How often to clean up old tasks
  238. int32 task_retention_seconds = 8; // How long to keep completed/failed tasks
  239. MaintenancePolicy policy = 9;
  240. }
  241. // MaintenancePolicy defines policies for maintenance operations
  242. message MaintenancePolicy {
  243. map<string, TaskPolicy> task_policies = 1; // Task type -> policy mapping
  244. int32 global_max_concurrent = 2; // Overall limit across all task types
  245. int32 default_repeat_interval_seconds = 3; // Default seconds if task doesn't specify
  246. int32 default_check_interval_seconds = 4; // Default seconds for periodic checks
  247. }
  248. // TaskPolicy represents configuration for a specific task type
  249. message TaskPolicy {
  250. bool enabled = 1;
  251. int32 max_concurrent = 2;
  252. int32 repeat_interval_seconds = 3; // Seconds to wait before repeating
  253. int32 check_interval_seconds = 4; // Seconds between checks
  254. // Typed task-specific configuration (replaces generic map)
  255. oneof task_config {
  256. VacuumTaskConfig vacuum_config = 5;
  257. ErasureCodingTaskConfig erasure_coding_config = 6;
  258. BalanceTaskConfig balance_config = 7;
  259. ReplicationTaskConfig replication_config = 8;
  260. }
  261. }
  262. // Task-specific configuration messages
  263. // VacuumTaskConfig contains vacuum-specific configuration
  264. message VacuumTaskConfig {
  265. double garbage_threshold = 1; // Minimum garbage ratio to trigger vacuum (0.0-1.0)
  266. int32 min_volume_age_hours = 2; // Minimum age before vacuum is considered
  267. int32 min_interval_seconds = 3; // Minimum time between vacuum operations on the same volume
  268. }
  269. // ErasureCodingTaskConfig contains EC-specific configuration
  270. message ErasureCodingTaskConfig {
  271. double fullness_ratio = 1; // Minimum fullness ratio to trigger EC (0.0-1.0)
  272. int32 quiet_for_seconds = 2; // Minimum quiet time before EC
  273. int32 min_volume_size_mb = 3; // Minimum volume size for EC
  274. string collection_filter = 4; // Only process volumes from specific collections
  275. }
  276. // BalanceTaskConfig contains balance-specific configuration
  277. message BalanceTaskConfig {
  278. double imbalance_threshold = 1; // Threshold for triggering rebalancing (0.0-1.0)
  279. int32 min_server_count = 2; // Minimum number of servers required for balancing
  280. }
  281. // ReplicationTaskConfig contains replication-specific configuration
  282. message ReplicationTaskConfig {
  283. int32 target_replica_count = 1; // Target number of replicas
  284. }
  285. // ========== Task Persistence Messages ==========
  286. // MaintenanceTaskData represents complete task state for persistence
  287. message MaintenanceTaskData {
  288. string id = 1;
  289. string type = 2;
  290. string priority = 3;
  291. string status = 4;
  292. uint32 volume_id = 5;
  293. string server = 6;
  294. string collection = 7;
  295. TaskParams typed_params = 8;
  296. string reason = 9;
  297. int64 created_at = 10;
  298. int64 scheduled_at = 11;
  299. int64 started_at = 12;
  300. int64 completed_at = 13;
  301. string worker_id = 14;
  302. string error = 15;
  303. double progress = 16;
  304. int32 retry_count = 17;
  305. int32 max_retries = 18;
  306. // Enhanced fields for detailed task tracking
  307. string created_by = 19;
  308. string creation_context = 20;
  309. repeated TaskAssignmentRecord assignment_history = 21;
  310. string detailed_reason = 22;
  311. map<string, string> tags = 23;
  312. TaskCreationMetrics creation_metrics = 24;
  313. }
  314. // TaskAssignmentRecord tracks worker assignments for a task
  315. message TaskAssignmentRecord {
  316. string worker_id = 1;
  317. string worker_address = 2;
  318. int64 assigned_at = 3;
  319. int64 unassigned_at = 4; // Optional: when worker was unassigned
  320. string reason = 5; // Reason for assignment/unassignment
  321. }
  322. // TaskCreationMetrics tracks why and how a task was created
  323. message TaskCreationMetrics {
  324. string trigger_metric = 1; // Name of metric that triggered creation
  325. double metric_value = 2; // Value that triggered creation
  326. double threshold = 3; // Threshold that was exceeded
  327. VolumeHealthMetrics volume_metrics = 4; // Volume health at creation time
  328. map<string, string> additional_data = 5; // Additional context data
  329. }
  330. // VolumeHealthMetrics captures volume state at task creation
  331. message VolumeHealthMetrics {
  332. uint64 total_size = 1;
  333. uint64 used_size = 2;
  334. uint64 garbage_size = 3;
  335. double garbage_ratio = 4;
  336. int32 file_count = 5;
  337. int32 deleted_file_count = 6;
  338. int64 last_modified = 7;
  339. int32 replica_count = 8;
  340. bool is_ec_volume = 9;
  341. string collection = 10;
  342. }
  343. // TaskStateFile wraps task data with metadata for persistence
  344. message TaskStateFile {
  345. MaintenanceTaskData task = 1;
  346. int64 last_updated = 2;
  347. string admin_version = 3;
  348. }