topology.go

package topology

import (
	"encoding/json"
	"errors"
	"fmt"
	"math/rand/v2"
	"slices"
	"sync"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/pb"
	"github.com/seaweedfs/seaweedfs/weed/storage/types"

	backoff "github.com/cenkalti/backoff/v4"
	hashicorpRaft "github.com/hashicorp/raft"

	"github.com/seaweedfs/raft"

	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
	"github.com/seaweedfs/seaweedfs/weed/sequence"
	"github.com/seaweedfs/seaweedfs/weed/stats"
	"github.com/seaweedfs/seaweedfs/weed/storage"
	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
	"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
	"github.com/seaweedfs/seaweedfs/weed/util"
)
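
// Topology is the root of the master's cluster tree: data centers contain
// racks, which contain data nodes. It also tracks collections and their
// volume layouts, erasure-coded shard locations, and the raft server state
// used for leader election.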
type Topology struct {
	vacuumLockCounter int64
	NodeImpl

	collectionMap  *util.ConcurrentReadMap
	ecShardMap     map[needle.VolumeId]*EcShardLocations
	ecShardMapLock sync.RWMutex

	pulse int64

	volumeSizeLimit  uint64
	replicationAsMin bool
	isDisableVacuum  bool

	Sequence sequence.Sequencer

	chanFullVolumes    chan storage.VolumeInfo
	chanCrowdedVolumes chan storage.VolumeInfo

	Configuration *Configuration

	RaftServer           raft.Server
	RaftServerAccessLock sync.RWMutex
	HashicorpRaft        *hashicorpRaft.Raft
	barrierLock          sync.Mutex
	barrierDone          bool

	UuidAccessLock sync.RWMutex
	UuidMap        map[string][]string

	LastLeaderChangeTime time.Time
}
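
// NewTopology builds an empty Topology rooted at id. pulse is the heartbeat
// interval in seconds, volumeSizeLimit is the per-volume size cap in bytes,
// and replicationAsMin treats the configured replica count as a minimum
// rather than an exact target.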
func NewTopology(id string, seq sequence.Sequencer, volumeSizeLimit uint64, pulse int, replicationAsMin bool) *Topology {
	t := &Topology{}
	t.id = NodeId(id)
	t.nodeType = "Topology"
	t.NodeImpl.value = t
	t.diskUsages = newDiskUsages()
	t.children = make(map[NodeId]Node)
	t.capacityReservations = newCapacityReservations()
	t.collectionMap = util.NewConcurrentReadMap()
	t.ecShardMap = make(map[needle.VolumeId]*EcShardLocations)
	t.pulse = int64(pulse)
	t.volumeSizeLimit = volumeSizeLimit
	t.replicationAsMin = replicationAsMin

	t.Sequence = seq

	t.chanFullVolumes = make(chan storage.VolumeInfo)
	t.chanCrowdedVolumes = make(chan storage.VolumeInfo)

	t.Configuration = &Configuration{}

	return t
}
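
// IsChildLocked reports whether the topology itself or any data center,
// rack, or data node beneath it currently holds a lock, along with an error
// naming the first locked node found.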
func (t *Topology) IsChildLocked() (bool, error) {
	if t.IsLocked() {
		return true, errors.New("topology is locked")
	}
	for _, dcNode := range t.Children() {
		if dcNode.IsLocked() {
			return true, fmt.Errorf("topology child %s is locked", dcNode.String())
		}
		for _, rackNode := range dcNode.Children() {
			if rackNode.IsLocked() {
				return true, fmt.Errorf("dc %s child %s is locked", dcNode.String(), rackNode.String())
			}
			for _, dataNode := range rackNode.Children() {
				if dataNode.IsLocked() {
					return true, fmt.Errorf("rack %s child %s is locked", rackNode.String(), dataNode.Id())
				}
			}
		}
	}
	return false, nil
}
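
// IsLeader reports whether this master is the raft leader, consulting
// whichever raft implementation (seaweedfs/raft or hashicorp/raft) is active.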
func (t *Topology) IsLeader() bool {
	t.RaftServerAccessLock.RLock()
	defer t.RaftServerAccessLock.RUnlock()

	if t.RaftServer != nil {
		if t.RaftServer.State() == raft.Leader {
			return true
		}
		if leader, err := t.Leader(); err == nil {
			if pb.ServerAddress(t.RaftServer.Name()) == leader {
				return true
			}
		}
	} else if t.HashicorpRaft != nil {
		if t.HashicorpRaft.State() == hashicorpRaft.Leader {
			return true
		}
	}
	return false
}
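
// IsLeaderAndCanRead reports whether this master is the leader and can
// safely serve reads. With hashicorp/raft, leadership alone is not enough:
// a barrier must complete first so the new leader has applied all earlier
// log entries.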
func (t *Topology) IsLeaderAndCanRead() bool {
	if t.RaftServer != nil {
		return t.IsLeader()
	} else if t.HashicorpRaft != nil {
		return t.IsLeader() && t.DoBarrier()
	} else {
		return false
	}
}
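
// DoBarrier issues a hashicorp/raft barrier and waits up to two minutes for
// all preceding log entries to be applied. A successful barrier is cached in
// barrierDone until BarrierReset is called.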
func (t *Topology) DoBarrier() bool {
	t.barrierLock.Lock()
	defer t.barrierLock.Unlock()
	if t.barrierDone {
		return true
	}

	glog.V(0).Infof("raft do barrier")
	barrier := t.HashicorpRaft.Barrier(2 * time.Minute)
	if err := barrier.Error(); err != nil {
		glog.Errorf("failed to wait for barrier, error %s", err)
		return false
	}

	t.barrierDone = true
	glog.V(0).Infof("raft do barrier success")
	return true
}
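
// BarrierReset clears the cached barrier result so the next call to
// DoBarrier issues a fresh barrier.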
func (t *Topology) BarrierReset() {
	t.barrierLock.Lock()
	defer t.barrierLock.Unlock()
	t.barrierDone = false
}
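
// Leader returns the current leader address, retrying with exponential
// backoff for up to 20 seconds while no leader has been elected yet.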
func (t *Topology) Leader() (l pb.ServerAddress, err error) {
	exponentialBackoff := backoff.NewExponentialBackOff()
	exponentialBackoff.InitialInterval = 100 * time.Millisecond
	exponentialBackoff.MaxElapsedTime = 20 * time.Second
	leaderNotSelected := errors.New("leader not selected yet")
	l, err = backoff.RetryWithData(
		func() (l pb.ServerAddress, err error) {
			l, err = t.MaybeLeader()
			if err == nil && l == "" {
				err = leaderNotSelected
			}
			return l, err
		},
		exponentialBackoff)
	if err == leaderNotSelected {
		l = ""
	}
	return l, err
}
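
// MaybeLeader returns the leader address as currently known, without
// retrying; the address may be empty if the election has not finished.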
func (t *Topology) MaybeLeader() (l pb.ServerAddress, err error) {
	t.RaftServerAccessLock.RLock()
	defer t.RaftServerAccessLock.RUnlock()

	if t.RaftServer != nil {
		l = pb.ServerAddress(t.RaftServer.Leader())
	} else if t.HashicorpRaft != nil {
		l = pb.ServerAddress(t.HashicorpRaft.Leader())
	} else {
		err = errors.New("Raft Server not ready yet!")
	}
	return
}
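
// Lookup returns the data nodes hosting volume vid. An empty collection name
// searches every collection, and erasure-coded shard locations are used as a
// fallback. An illustrative call site (names are not from this file):
//
//	if nodes := topo.Lookup("", vid); nodes != nil {
//		// the volume (or its EC shards) can be read from any of nodes
//	}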
func (t *Topology) Lookup(collection string, vid needle.VolumeId) (dataNodes []*DataNode) {
	// maybe an issue if lots of collections?
	if collection == "" {
		for _, c := range t.collectionMap.Items() {
			if list := c.(*Collection).Lookup(vid); list != nil {
				return list
			}
		}
	} else {
		if c, ok := t.collectionMap.Find(collection); ok {
			return c.(*Collection).Lookup(vid)
		}
	}

	if locations, found := t.LookupEcShards(vid); found {
		for _, loc := range locations.Locations {
			dataNodes = append(dataNodes, loc...)
		}
		return dataNodes
	}

	return nil
}
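
// NextVolumeId reserves the next volume id by committing a max-volume-id
// command through raft, so every master agrees on the highest id ever
// assigned. Only a leader that can read may call it.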
func (t *Topology) NextVolumeId() (needle.VolumeId, error) {
	if !t.IsLeaderAndCanRead() {
		return 0, fmt.Errorf("as leader can not read yet")
	}
	vid := t.GetMaxVolumeId()
	next := vid.Next()

	t.RaftServerAccessLock.RLock()
	defer t.RaftServerAccessLock.RUnlock()

	if t.RaftServer != nil {
		if _, err := t.RaftServer.Do(NewMaxVolumeIdCommand(next)); err != nil {
			return 0, err
		}
	} else if t.HashicorpRaft != nil {
		b, err := json.Marshal(NewMaxVolumeIdCommand(next))
		if err != nil {
			return 0, fmt.Errorf("failed marshal NewMaxVolumeIdCommand: %+v", err)
		}
		if future := t.HashicorpRaft.Apply(b, time.Second); future.Error() != nil {
			return 0, future.Error()
		}
	}
	return next, nil
}
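
// PickForWrite picks a writable volume from volumeLayout and reserves
// requestedCount file ids from the sequencer, returning the first file id,
// the count actually granted, and the volume's replica locations. A minimal
// call-site sketch (variable names are illustrative, not from this file):
//
//	fileId, count, locations, shouldGrow, err := topo.PickForWrite(1, option, layout)
//	if err != nil {
//		// no writable volume; the caller may grow volumes if shouldGrow is set
//	}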
func (t *Topology) PickForWrite(requestedCount uint64, option *VolumeGrowOption, volumeLayout *VolumeLayout) (fileId string, count uint64, volumeLocationList *VolumeLocationList, shouldGrow bool, err error) {
	var vid needle.VolumeId
	vid, count, volumeLocationList, shouldGrow, err = volumeLayout.PickForWrite(requestedCount, option)
	if err != nil {
		return "", 0, nil, shouldGrow, fmt.Errorf("failed to find writable volumes for collection:%s replication:%s ttl:%s error: %v", option.Collection, option.ReplicaPlacement.String(), option.Ttl.String(), err)
	}
	if volumeLocationList == nil || volumeLocationList.Length() == 0 {
		return "", 0, nil, shouldGrow, fmt.Errorf("%s available for collection:%s replication:%s ttl:%s", NoWritableVolumes, option.Collection, option.ReplicaPlacement.String(), option.Ttl.String())
	}
	nextFileId := t.Sequence.NextFileId(requestedCount)
	fileId = needle.NewFileId(vid, nextFileId, rand.Uint32()).String()
	return fileId, count, volumeLocationList, shouldGrow, nil
}
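
// GetVolumeLayout returns the volume layout for the given collection,
// replica placement, TTL, and disk type, creating the collection and layout
// on first use.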
func (t *Topology) GetVolumeLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL, diskType types.DiskType) *VolumeLayout {
	return t.collectionMap.Get(collectionName, func() interface{} {
		return NewCollection(collectionName, t.volumeSizeLimit, t.replicationAsMin)
	}).(*Collection).GetOrCreateVolumeLayout(rp, ttl, diskType)
}
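
// ListCollections returns the sorted names of collections holding normal
// volumes, EC volumes, or both, depending on the flags.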
func (t *Topology) ListCollections(includeNormalVolumes, includeEcVolumes bool) (ret []string) {
	found := make(map[string]bool)
	if includeNormalVolumes {
		t.collectionMap.RLock()
		for _, c := range t.collectionMap.Items() {
			found[c.(*Collection).Name] = true
		}
		t.collectionMap.RUnlock()
	}
	if includeEcVolumes {
		t.ecShardMapLock.RLock()
		for _, ecVolumeLocation := range t.ecShardMap {
			found[ecVolumeLocation.Collection] = true
		}
		t.ecShardMapLock.RUnlock()
	}

	for k := range found {
		ret = append(ret, k)
	}
	slices.Sort(ret)
	return ret
}
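
// FindCollection looks up a collection by name, reporting whether it exists.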
func (t *Topology) FindCollection(collectionName string) (*Collection, bool) {
	c, hasCollection := t.collectionMap.Find(collectionName)
	if !hasCollection {
		return nil, false
	}
	return c.(*Collection), hasCollection
}
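
// DeleteCollection removes a collection from the topology.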
func (t *Topology) DeleteCollection(collectionName string) {
	t.collectionMap.Delete(collectionName)
}
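
// DeleteLayout removes one volume layout from a collection and drops the
// collection itself once its last layout is gone.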
func (t *Topology) DeleteLayout(collectionName string, rp *super_block.ReplicaPlacement, ttl *needle.TTL, diskType types.DiskType) {
	collection, found := t.FindCollection(collectionName)
	if !found {
		return
	}
	collection.DeleteVolumeLayout(rp, ttl, diskType)
	if len(collection.storageType2VolumeLayout.Items()) == 0 {
		t.DeleteCollection(collectionName)
	}
}
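
// RegisterVolumeLayout records a volume replica hosted on dn in the matching
// layout and rechecks the volume's writability.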
func (t *Topology) RegisterVolumeLayout(v storage.VolumeInfo, dn *DataNode) {
	diskType := types.ToDiskType(v.DiskType)
	vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
	vl.RegisterVolume(&v, dn)
	vl.EnsureCorrectWritables(&v)
}
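
// UnRegisterVolumeLayout removes a volume replica hosted on dn from its
// layout, deleting the layout (and possibly its collection) once empty.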
func (t *Topology) UnRegisterVolumeLayout(v storage.VolumeInfo, dn *DataNode) {
	glog.Infof("removing volume info: %+v from %v", v, dn.id)
	if v.ReplicaPlacement.GetCopyCount() > 1 {
		stats.MasterReplicaPlacementMismatch.WithLabelValues(v.Collection, v.Id.String()).Set(0)
	}
	diskType := types.ToDiskType(v.DiskType)
	volumeLayout := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
	volumeLayout.UnRegisterVolume(&v, dn)
	if volumeLayout.isEmpty() {
		t.DeleteLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
	}
}
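
// DataCenterExists reports whether a data center with the given name exists;
// an empty name is treated as a wildcard and always matches.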
func (t *Topology) DataCenterExists(dcName string) bool {
	return dcName == "" || t.GetDataCenter(dcName) != nil
}
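
// GetDataCenter returns the data center named dcName, or nil if none matches.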
func (t *Topology) GetDataCenter(dcName string) *DataCenter {
	t.RLock()
	defer t.RUnlock()
	for _, c := range t.children {
		dc := c.(*DataCenter)
		if string(dc.Id()) == dcName {
			return dc
		}
	}
	// no match: return nil so DataCenterExists reports false, rather than
	// returning whichever data center happened to be examined last
	return nil
}
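
// GetOrCreateDataCenter returns the data center named dcName, creating and
// linking it under the topology if it does not exist yet.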
func (t *Topology) GetOrCreateDataCenter(dcName string) *DataCenter {
	t.Lock()
	defer t.Unlock()
	for _, c := range t.children {
		dc := c.(*DataCenter)
		if string(dc.Id()) == dcName {
			return dc
		}
	}
	dc := NewDataCenter(dcName)
	t.doLinkChildNode(dc)
	return dc
}
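
// ListDataCenters returns the names of all data centers.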
func (t *Topology) ListDataCenters() (dcs []string) {
	t.RLock()
	defer t.RUnlock()
	for _, c := range t.children {
		dcs = append(dcs, string(c.(*DataCenter).Id()))
	}
	return dcs
}
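
// ListDCAndRacks returns a map from each data center id to the ids of its racks.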
func (t *Topology) ListDCAndRacks() (dcs map[NodeId][]NodeId) {
	t.RLock()
	defer t.RUnlock()
	dcs = make(map[NodeId][]NodeId)
	for _, dcNode := range t.children {
		dcNodeId := dcNode.(*DataCenter).Id()
		for _, rackNode := range dcNode.Children() {
			dcs[dcNodeId] = append(dcs[dcNodeId], rackNode.(*Rack).Id())
		}
	}
	return dcs
}
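
// SyncDataNodeRegistration reconciles a data node's full volume list, as
// reported in a heartbeat, against the topology: new volumes are registered,
// missing ones unregistered, and changed ones rechecked for writability.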
func (t *Topology) SyncDataNodeRegistration(volumes []*master_pb.VolumeInformationMessage, dn *DataNode) (newVolumes, deletedVolumes []storage.VolumeInfo) {
	// convert into in memory struct storage.VolumeInfo
	var volumeInfos []storage.VolumeInfo
	for _, v := range volumes {
		if vi, err := storage.NewVolumeInfo(v); err == nil {
			volumeInfos = append(volumeInfos, vi)
		} else {
			glog.V(0).Infof("Failed to convert joined volume information: %v", err)
		}
	}
	// find out the delta volumes
	var changedVolumes []storage.VolumeInfo
	newVolumes, deletedVolumes, changedVolumes = dn.UpdateVolumes(volumeInfos)
	for _, v := range newVolumes {
		t.RegisterVolumeLayout(v, dn)
	}
	for _, v := range deletedVolumes {
		t.UnRegisterVolumeLayout(v, dn)
	}
	for _, v := range changedVolumes {
		diskType := types.ToDiskType(v.DiskType)
		vl := t.GetVolumeLayout(v.Collection, v.ReplicaPlacement, v.Ttl, diskType)
		vl.EnsureCorrectWritables(&v)
	}
	return
}
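
// IncrementalSyncDataNodeRegistration applies a delta heartbeat, registering
// newVolumes and unregistering deletedVolumes on dn.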
func (t *Topology) IncrementalSyncDataNodeRegistration(newVolumes, deletedVolumes []*master_pb.VolumeShortInformationMessage, dn *DataNode) {
	var newVis, oldVis []storage.VolumeInfo
	for _, v := range newVolumes {
		vi, err := storage.NewVolumeInfoFromShort(v)
		if err != nil {
			glog.V(0).Infof("NewVolumeInfoFromShort %v: %v", v, err)
			continue
		}
		newVis = append(newVis, vi)
	}
	for _, v := range deletedVolumes {
		vi, err := storage.NewVolumeInfoFromShort(v)
		if err != nil {
			glog.V(0).Infof("NewVolumeInfoFromShort %v: %v", v, err)
			continue
		}
		oldVis = append(oldVis, vi)
	}
	dn.DeltaUpdateVolumes(newVis, oldVis)

	for _, vi := range newVis {
		t.RegisterVolumeLayout(vi, dn)
	}
	for _, vi := range oldVis {
		t.UnRegisterVolumeLayout(vi, dn)
	}
}
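
// DataNodeRegistration links a not-yet-attached data node under its data
// center and rack, creating both if needed.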
func (t *Topology) DataNodeRegistration(dcName, rackName string, dn *DataNode) {
	if dn.Parent() != nil {
		return
	}
	// registration to topo
	dc := t.GetOrCreateDataCenter(dcName)
	rack := dc.GetOrCreateRack(rackName)
	rack.LinkChildNode(dn)
	glog.Infof("[%s] reLink To topo ", dn.Id())
}
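
// DisableVacuum stops the topology from starting new vacuum runs.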
func (t *Topology) DisableVacuum() {
	glog.V(0).Infof("DisableVacuum")
	t.isDisableVacuum = true
}
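
// EnableVacuum re-allows vacuum runs.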
func (t *Topology) EnableVacuum() {
	glog.V(0).Infof("EnableVacuum")
	t.isDisableVacuum = false
}