pending_operations.go 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311
  1. package maintenance
  2. import (
  3. "sync"
  4. "time"
  5. "github.com/seaweedfs/seaweedfs/weed/glog"
  6. "github.com/seaweedfs/seaweedfs/weed/worker/types"
  7. )
  8. // PendingOperationType represents the type of pending operation
  9. type PendingOperationType string
  10. const (
  11. OpTypeVolumeMove PendingOperationType = "volume_move"
  12. OpTypeVolumeBalance PendingOperationType = "volume_balance"
  13. OpTypeErasureCoding PendingOperationType = "erasure_coding"
  14. OpTypeVacuum PendingOperationType = "vacuum"
  15. OpTypeReplication PendingOperationType = "replication"
  16. )
  17. // PendingOperation represents a pending volume/shard operation
  18. type PendingOperation struct {
  19. VolumeID uint32 `json:"volume_id"`
  20. OperationType PendingOperationType `json:"operation_type"`
  21. SourceNode string `json:"source_node"`
  22. DestNode string `json:"dest_node,omitempty"` // Empty for non-movement operations
  23. TaskID string `json:"task_id"`
  24. StartTime time.Time `json:"start_time"`
  25. EstimatedSize uint64 `json:"estimated_size"` // Bytes
  26. Collection string `json:"collection"`
  27. Status string `json:"status"` // "assigned", "in_progress", "completing"
  28. }
  29. // PendingOperations tracks all pending volume/shard operations
  30. type PendingOperations struct {
  31. // Operations by volume ID for conflict detection
  32. byVolumeID map[uint32]*PendingOperation
  33. // Operations by task ID for updates
  34. byTaskID map[string]*PendingOperation
  35. // Operations by node for capacity calculations
  36. bySourceNode map[string][]*PendingOperation
  37. byDestNode map[string][]*PendingOperation
  38. mutex sync.RWMutex
  39. }
  40. // NewPendingOperations creates a new pending operations tracker
  41. func NewPendingOperations() *PendingOperations {
  42. return &PendingOperations{
  43. byVolumeID: make(map[uint32]*PendingOperation),
  44. byTaskID: make(map[string]*PendingOperation),
  45. bySourceNode: make(map[string][]*PendingOperation),
  46. byDestNode: make(map[string][]*PendingOperation),
  47. }
  48. }
  49. // AddOperation adds a pending operation
  50. func (po *PendingOperations) AddOperation(op *PendingOperation) {
  51. po.mutex.Lock()
  52. defer po.mutex.Unlock()
  53. // Check for existing operation on this volume
  54. if existing, exists := po.byVolumeID[op.VolumeID]; exists {
  55. glog.V(1).Infof("Replacing existing pending operation on volume %d: %s -> %s",
  56. op.VolumeID, existing.TaskID, op.TaskID)
  57. po.removeOperationUnlocked(existing)
  58. }
  59. // Add new operation
  60. po.byVolumeID[op.VolumeID] = op
  61. po.byTaskID[op.TaskID] = op
  62. // Add to node indexes
  63. po.bySourceNode[op.SourceNode] = append(po.bySourceNode[op.SourceNode], op)
  64. if op.DestNode != "" {
  65. po.byDestNode[op.DestNode] = append(po.byDestNode[op.DestNode], op)
  66. }
  67. glog.V(2).Infof("Added pending operation: volume %d, type %s, task %s, %s -> %s",
  68. op.VolumeID, op.OperationType, op.TaskID, op.SourceNode, op.DestNode)
  69. }
  70. // RemoveOperation removes a completed operation
  71. func (po *PendingOperations) RemoveOperation(taskID string) {
  72. po.mutex.Lock()
  73. defer po.mutex.Unlock()
  74. if op, exists := po.byTaskID[taskID]; exists {
  75. po.removeOperationUnlocked(op)
  76. glog.V(2).Infof("Removed completed operation: volume %d, task %s", op.VolumeID, taskID)
  77. }
  78. }
  79. // removeOperationUnlocked removes an operation (must hold lock)
  80. func (po *PendingOperations) removeOperationUnlocked(op *PendingOperation) {
  81. delete(po.byVolumeID, op.VolumeID)
  82. delete(po.byTaskID, op.TaskID)
  83. // Remove from source node list
  84. if ops, exists := po.bySourceNode[op.SourceNode]; exists {
  85. for i, other := range ops {
  86. if other.TaskID == op.TaskID {
  87. po.bySourceNode[op.SourceNode] = append(ops[:i], ops[i+1:]...)
  88. break
  89. }
  90. }
  91. }
  92. // Remove from dest node list
  93. if op.DestNode != "" {
  94. if ops, exists := po.byDestNode[op.DestNode]; exists {
  95. for i, other := range ops {
  96. if other.TaskID == op.TaskID {
  97. po.byDestNode[op.DestNode] = append(ops[:i], ops[i+1:]...)
  98. break
  99. }
  100. }
  101. }
  102. }
  103. }
  104. // HasPendingOperationOnVolume checks if a volume has a pending operation
  105. func (po *PendingOperations) HasPendingOperationOnVolume(volumeID uint32) bool {
  106. po.mutex.RLock()
  107. defer po.mutex.RUnlock()
  108. _, exists := po.byVolumeID[volumeID]
  109. return exists
  110. }
  111. // GetPendingOperationOnVolume returns the pending operation on a volume
  112. func (po *PendingOperations) GetPendingOperationOnVolume(volumeID uint32) *PendingOperation {
  113. po.mutex.RLock()
  114. defer po.mutex.RUnlock()
  115. return po.byVolumeID[volumeID]
  116. }
  117. // WouldConflictWithPending checks if a new operation would conflict with pending ones
  118. func (po *PendingOperations) WouldConflictWithPending(volumeID uint32, opType PendingOperationType) bool {
  119. po.mutex.RLock()
  120. defer po.mutex.RUnlock()
  121. if existing, exists := po.byVolumeID[volumeID]; exists {
  122. // Volume already has a pending operation
  123. glog.V(3).Infof("Volume %d conflict: already has %s operation (task %s)",
  124. volumeID, existing.OperationType, existing.TaskID)
  125. return true
  126. }
  127. return false
  128. }
  129. // GetPendingCapacityImpactForNode calculates pending capacity changes for a node
  130. func (po *PendingOperations) GetPendingCapacityImpactForNode(nodeID string) (incoming uint64, outgoing uint64) {
  131. po.mutex.RLock()
  132. defer po.mutex.RUnlock()
  133. // Calculate outgoing capacity (volumes leaving this node)
  134. if ops, exists := po.bySourceNode[nodeID]; exists {
  135. for _, op := range ops {
  136. // Only count movement operations
  137. if op.DestNode != "" {
  138. outgoing += op.EstimatedSize
  139. }
  140. }
  141. }
  142. // Calculate incoming capacity (volumes coming to this node)
  143. if ops, exists := po.byDestNode[nodeID]; exists {
  144. for _, op := range ops {
  145. incoming += op.EstimatedSize
  146. }
  147. }
  148. return incoming, outgoing
  149. }
  150. // FilterVolumeMetricsExcludingPending filters out volumes with pending operations
  151. func (po *PendingOperations) FilterVolumeMetricsExcludingPending(metrics []*types.VolumeHealthMetrics) []*types.VolumeHealthMetrics {
  152. po.mutex.RLock()
  153. defer po.mutex.RUnlock()
  154. var filtered []*types.VolumeHealthMetrics
  155. excludedCount := 0
  156. for _, metric := range metrics {
  157. if _, hasPending := po.byVolumeID[metric.VolumeID]; !hasPending {
  158. filtered = append(filtered, metric)
  159. } else {
  160. excludedCount++
  161. glog.V(3).Infof("Excluding volume %d from scan due to pending operation", metric.VolumeID)
  162. }
  163. }
  164. if excludedCount > 0 {
  165. glog.V(1).Infof("Filtered out %d volumes with pending operations from %d total volumes",
  166. excludedCount, len(metrics))
  167. }
  168. return filtered
  169. }
  170. // GetNodeCapacityProjection calculates projected capacity for a node
  171. func (po *PendingOperations) GetNodeCapacityProjection(nodeID string, currentUsed uint64, totalCapacity uint64) NodeCapacityProjection {
  172. incoming, outgoing := po.GetPendingCapacityImpactForNode(nodeID)
  173. projectedUsed := currentUsed + incoming - outgoing
  174. projectedFree := totalCapacity - projectedUsed
  175. return NodeCapacityProjection{
  176. NodeID: nodeID,
  177. CurrentUsed: currentUsed,
  178. TotalCapacity: totalCapacity,
  179. PendingIncoming: incoming,
  180. PendingOutgoing: outgoing,
  181. ProjectedUsed: projectedUsed,
  182. ProjectedFree: projectedFree,
  183. }
  184. }
  185. // GetAllPendingOperations returns all pending operations
  186. func (po *PendingOperations) GetAllPendingOperations() []*PendingOperation {
  187. po.mutex.RLock()
  188. defer po.mutex.RUnlock()
  189. var operations []*PendingOperation
  190. for _, op := range po.byVolumeID {
  191. operations = append(operations, op)
  192. }
  193. return operations
  194. }
  195. // UpdateOperationStatus updates the status of a pending operation
  196. func (po *PendingOperations) UpdateOperationStatus(taskID string, status string) {
  197. po.mutex.Lock()
  198. defer po.mutex.Unlock()
  199. if op, exists := po.byTaskID[taskID]; exists {
  200. op.Status = status
  201. glog.V(3).Infof("Updated operation status: task %s, volume %d -> %s", taskID, op.VolumeID, status)
  202. }
  203. }
  204. // CleanupStaleOperations removes operations that have been running too long
  205. func (po *PendingOperations) CleanupStaleOperations(maxAge time.Duration) int {
  206. po.mutex.Lock()
  207. defer po.mutex.Unlock()
  208. cutoff := time.Now().Add(-maxAge)
  209. var staleOps []*PendingOperation
  210. for _, op := range po.byVolumeID {
  211. if op.StartTime.Before(cutoff) {
  212. staleOps = append(staleOps, op)
  213. }
  214. }
  215. for _, op := range staleOps {
  216. po.removeOperationUnlocked(op)
  217. glog.Warningf("Removed stale pending operation: volume %d, task %s, age %v",
  218. op.VolumeID, op.TaskID, time.Since(op.StartTime))
  219. }
  220. return len(staleOps)
  221. }
  222. // NodeCapacityProjection represents projected capacity for a node
  223. type NodeCapacityProjection struct {
  224. NodeID string `json:"node_id"`
  225. CurrentUsed uint64 `json:"current_used"`
  226. TotalCapacity uint64 `json:"total_capacity"`
  227. PendingIncoming uint64 `json:"pending_incoming"`
  228. PendingOutgoing uint64 `json:"pending_outgoing"`
  229. ProjectedUsed uint64 `json:"projected_used"`
  230. ProjectedFree uint64 `json:"projected_free"`
  231. }
  232. // GetStats returns statistics about pending operations
  233. func (po *PendingOperations) GetStats() PendingOperationsStats {
  234. po.mutex.RLock()
  235. defer po.mutex.RUnlock()
  236. stats := PendingOperationsStats{
  237. TotalOperations: len(po.byVolumeID),
  238. ByType: make(map[PendingOperationType]int),
  239. ByStatus: make(map[string]int),
  240. }
  241. var totalSize uint64
  242. for _, op := range po.byVolumeID {
  243. stats.ByType[op.OperationType]++
  244. stats.ByStatus[op.Status]++
  245. totalSize += op.EstimatedSize
  246. }
  247. stats.TotalEstimatedSize = totalSize
  248. return stats
  249. }
  250. // PendingOperationsStats provides statistics about pending operations
  251. type PendingOperationsStats struct {
  252. TotalOperations int `json:"total_operations"`
  253. ByType map[PendingOperationType]int `json:"by_type"`
  254. ByStatus map[string]int `json:"by_status"`
  255. TotalEstimatedSize uint64 `json:"total_estimated_size"`
  256. }