log_buffer.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. package log_buffer
  2. import (
  3. "bytes"
  4. "sync"
  5. "sync/atomic"
  6. "time"
  7. "google.golang.org/protobuf/proto"
  8. "github.com/seaweedfs/seaweedfs/weed/glog"
  9. "github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
  10. "github.com/seaweedfs/seaweedfs/weed/pb/mq_pb"
  11. "github.com/seaweedfs/seaweedfs/weed/util"
  12. )
  // Sizing parameters used when a LogBuffer is constructed.
  const (
  	// BufferSize is the length, in bytes, of the active write buffer that
  	// NewLogBuffer allocates (8 MiB).
  	BufferSize = 8 * 1024 * 1024
  	// PreviousBufferCount is the count handed to newSealedBuffers by
  	// NewLogBuffer.
  	PreviousBufferCount = 32
  )
  // dataToFlush is one batch queued on the flush channel: a copy of the bytes
  // that were in the write buffer plus the time range recorded for them.
  type dataToFlush struct {
  	// startTime and stopTime are copied from the buffer's own start/stop
  	// times at the moment the batch is created.
  	startTime, stopTime time.Time
  	// data holds the copied bytes; it is returned to the buffer pool by
  	// releaseMemory.
  	data *bytes.Buffer
  }
  20. type EachLogEntryFuncType func(logEntry *filer_pb.LogEntry) (isDone bool, err error)
  21. type EachLogEntryWithBatchIndexFuncType func(logEntry *filer_pb.LogEntry, batchIndex int64) (isDone bool, err error)
  22. type LogFlushFuncType func(logBuffer *LogBuffer, startTime, stopTime time.Time, buf []byte)
  23. type LogReadFromDiskFuncType func(startPosition MessagePosition, stopTsNs int64, eachLogEntryFn EachLogEntryFuncType) (lastReadPosition MessagePosition, isDone bool, err error)
  24. type LogBuffer struct {
  25. LastFlushTsNs int64
  26. name string
  27. prevBuffers *SealedBuffers
  28. buf []byte
  29. batchIndex int64
  30. idx []int
  31. pos int
  32. startTime time.Time
  33. stopTime time.Time
  34. lastFlushDataTime time.Time
  35. sizeBuf []byte
  36. flushInterval time.Duration
  37. flushFn LogFlushFuncType
  38. ReadFromDiskFn LogReadFromDiskFuncType
  39. notifyFn func()
  40. isStopping *atomic.Bool
  41. isAllFlushed bool
  42. flushChan chan *dataToFlush
  43. LastTsNs atomic.Int64
  44. sync.RWMutex
  45. }
  46. func NewLogBuffer(name string, flushInterval time.Duration, flushFn LogFlushFuncType,
  47. readFromDiskFn LogReadFromDiskFuncType, notifyFn func()) *LogBuffer {
  48. lb := &LogBuffer{
  49. name: name,
  50. prevBuffers: newSealedBuffers(PreviousBufferCount),
  51. buf: make([]byte, BufferSize),
  52. sizeBuf: make([]byte, 4),
  53. flushInterval: flushInterval,
  54. flushFn: flushFn,
  55. ReadFromDiskFn: readFromDiskFn,
  56. notifyFn: notifyFn,
  57. flushChan: make(chan *dataToFlush, 256),
  58. isStopping: new(atomic.Bool),
  59. batchIndex: time.Now().UnixNano(), // Initialize with creation time for uniqueness across restarts
  60. }
  61. go lb.loopFlush()
  62. go lb.loopInterval()
  63. return lb
  64. }
  65. func (logBuffer *LogBuffer) AddToBuffer(message *mq_pb.DataMessage) {
  66. logBuffer.AddDataToBuffer(message.Key, message.Value, message.TsNs)
  67. }
  68. func (logBuffer *LogBuffer) AddDataToBuffer(partitionKey, data []byte, processingTsNs int64) {
  69. // PERFORMANCE OPTIMIZATION: Pre-process expensive operations OUTSIDE the lock
  70. var ts time.Time
  71. if processingTsNs == 0 {
  72. ts = time.Now()
  73. processingTsNs = ts.UnixNano()
  74. } else {
  75. ts = time.Unix(0, processingTsNs)
  76. }
  77. logEntry := &filer_pb.LogEntry{
  78. TsNs: processingTsNs, // Will be updated if needed
  79. PartitionKeyHash: util.HashToInt32(partitionKey),
  80. Data: data,
  81. Key: partitionKey,
  82. }
  83. logEntryData, _ := proto.Marshal(logEntry)
  84. var toFlush *dataToFlush
  85. logBuffer.Lock()
  86. defer func() {
  87. logBuffer.Unlock()
  88. if toFlush != nil {
  89. logBuffer.flushChan <- toFlush
  90. }
  91. if logBuffer.notifyFn != nil {
  92. logBuffer.notifyFn()
  93. }
  94. }()
  95. // Handle timestamp collision inside lock (rare case)
  96. if logBuffer.LastTsNs.Load() >= processingTsNs {
  97. processingTsNs = logBuffer.LastTsNs.Add(1)
  98. ts = time.Unix(0, processingTsNs)
  99. // Re-marshal with corrected timestamp
  100. logEntry.TsNs = processingTsNs
  101. logEntryData, _ = proto.Marshal(logEntry)
  102. } else {
  103. logBuffer.LastTsNs.Store(processingTsNs)
  104. }
  105. size := len(logEntryData)
  106. if logBuffer.pos == 0 {
  107. logBuffer.startTime = ts
  108. }
  109. if logBuffer.startTime.Add(logBuffer.flushInterval).Before(ts) || len(logBuffer.buf)-logBuffer.pos < size+4 {
  110. // glog.V(0).Infof("%s copyToFlush1 batch:%d count:%d start time %v, ts %v, remaining %d bytes", logBuffer.name, logBuffer.batchIndex, len(logBuffer.idx), logBuffer.startTime, ts, len(logBuffer.buf)-logBuffer.pos)
  111. toFlush = logBuffer.copyToFlush()
  112. logBuffer.startTime = ts
  113. if len(logBuffer.buf) < size+4 {
  114. logBuffer.buf = make([]byte, 2*size+4)
  115. }
  116. }
  117. logBuffer.stopTime = ts
  118. logBuffer.idx = append(logBuffer.idx, logBuffer.pos)
  119. util.Uint32toBytes(logBuffer.sizeBuf, uint32(size))
  120. copy(logBuffer.buf[logBuffer.pos:logBuffer.pos+4], logBuffer.sizeBuf)
  121. copy(logBuffer.buf[logBuffer.pos+4:logBuffer.pos+4+size], logEntryData)
  122. logBuffer.pos += size + 4
  123. // fmt.Printf("partitionKey %v entry size %d total %d count %d\n", string(partitionKey), size, m.pos, len(m.idx))
  124. }
  125. func (logBuffer *LogBuffer) IsStopping() bool {
  126. return logBuffer.isStopping.Load()
  127. }
  128. // ShutdownLogBuffer flushes the buffer and stops the log buffer
  129. func (logBuffer *LogBuffer) ShutdownLogBuffer() {
  130. isAlreadyStopped := logBuffer.isStopping.Swap(true)
  131. if isAlreadyStopped {
  132. return
  133. }
  134. toFlush := logBuffer.copyToFlush()
  135. logBuffer.flushChan <- toFlush
  136. close(logBuffer.flushChan)
  137. }
  138. // IsAllFlushed returns true if all data in the buffer has been flushed, after calling ShutdownLogBuffer().
  139. func (logBuffer *LogBuffer) IsAllFlushed() bool {
  140. return logBuffer.isAllFlushed
  141. }
  142. func (logBuffer *LogBuffer) loopFlush() {
  143. for d := range logBuffer.flushChan {
  144. if d != nil {
  145. // glog.V(4).Infof("%s flush [%v, %v] size %d", m.name, d.startTime, d.stopTime, len(d.data.Bytes()))
  146. logBuffer.flushFn(logBuffer, d.startTime, d.stopTime, d.data.Bytes())
  147. d.releaseMemory()
  148. // local logbuffer is different from aggregate logbuffer here
  149. logBuffer.lastFlushDataTime = d.stopTime
  150. }
  151. }
  152. logBuffer.isAllFlushed = true
  153. }
  154. func (logBuffer *LogBuffer) loopInterval() {
  155. for !logBuffer.IsStopping() {
  156. time.Sleep(logBuffer.flushInterval)
  157. if logBuffer.IsStopping() {
  158. return
  159. }
  160. logBuffer.Lock()
  161. toFlush := logBuffer.copyToFlush()
  162. logBuffer.Unlock()
  163. if toFlush != nil {
  164. glog.V(4).Infof("%s flush [%v, %v] size %d", logBuffer.name, toFlush.startTime, toFlush.stopTime, len(toFlush.data.Bytes()))
  165. logBuffer.flushChan <- toFlush
  166. } else {
  167. // glog.V(0).Infof("%s no flush", m.name)
  168. }
  169. }
  170. }
  171. func (logBuffer *LogBuffer) copyToFlush() *dataToFlush {
  172. if logBuffer.pos > 0 {
  173. // fmt.Printf("flush buffer %d pos %d empty space %d\n", len(m.buf), m.pos, len(m.buf)-m.pos)
  174. var d *dataToFlush
  175. if logBuffer.flushFn != nil {
  176. d = &dataToFlush{
  177. startTime: logBuffer.startTime,
  178. stopTime: logBuffer.stopTime,
  179. data: copiedBytes(logBuffer.buf[:logBuffer.pos]),
  180. }
  181. // glog.V(4).Infof("%s flushing [0,%d) with %d entries [%v, %v]", m.name, m.pos, len(m.idx), m.startTime, m.stopTime)
  182. } else {
  183. // glog.V(4).Infof("%s removed from memory [0,%d) with %d entries [%v, %v]", m.name, m.pos, len(m.idx), m.startTime, m.stopTime)
  184. logBuffer.lastFlushDataTime = logBuffer.stopTime
  185. }
  186. logBuffer.buf = logBuffer.prevBuffers.SealBuffer(logBuffer.startTime, logBuffer.stopTime, logBuffer.buf, logBuffer.pos, logBuffer.batchIndex)
  187. logBuffer.startTime = time.Unix(0, 0)
  188. logBuffer.stopTime = time.Unix(0, 0)
  189. logBuffer.pos = 0
  190. logBuffer.idx = logBuffer.idx[:0]
  191. logBuffer.batchIndex++
  192. return d
  193. }
  194. return nil
  195. }
  196. func (logBuffer *LogBuffer) GetEarliestTime() time.Time {
  197. return logBuffer.startTime
  198. }
  199. func (logBuffer *LogBuffer) GetEarliestPosition() MessagePosition {
  200. return MessagePosition{
  201. Time: logBuffer.startTime,
  202. BatchIndex: logBuffer.batchIndex,
  203. }
  204. }
  205. func (d *dataToFlush) releaseMemory() {
  206. d.data.Reset()
  207. bufferPool.Put(d.data)
  208. }
  // ReadFromBuffer returns a pooled copy of the in-memory log data that follows
  // lastReadPosition, together with the batch index of the buffer the data was
  // taken from. The whole call runs under the read lock.
  //
  // Results visible in this function:
  //   - (nil, -2, nil): no in-memory data to compare against, or no entry newer
  //     than lastReadPosition was found in the active buffer.
  //   - (nil, -2, ResumeFromDiskError): lastReadPosition is older than the
  //     earliest in-memory data, more than one batch behind, and something has
  //     already been flushed; the caller is expected to read from disk.
  //   - (nil, logBuffer.batchIndex, nil): lastReadPosition is at or after the
  //     active buffer's stop time, i.e. there is nothing newer in memory.
  //   - (buffer, batchIndex, nil): data from a sealed buffer or from the active
  //     buffer. The buffer comes from bufferPool via copiedBytes; ReleaseMemory
  //     puts it back.
  //
  // NOTE(review): MessagePosition is declared elsewhere; Before/After/Equal/
  // UnixNano are presumably promoted from its Time field — confirm.
  func (logBuffer *LogBuffer) ReadFromBuffer(lastReadPosition MessagePosition) (bufferCopy *bytes.Buffer, batchIndex int64, err error) {
  	logBuffer.RLock()
  	defer logBuffer.RUnlock()
  	// Read from disk and memory
  	//	1. read from disk, last time is = td
  	//	2. in memory, the earliest time = tm
  	//	if tm <= td, case 2.1
  	//		read from memory
  	//	if tm is empty, case 2.2
  	//		read from memory
  	//	if td < tm, case 2.3
  	//		read from disk again

  	// Find the earliest start time held in memory (tsMemory) and the batch
  	// index that goes with it, starting from the active buffer and then
  	// looking at the sealed buffers.
  	// NOTE(review): copyToFlush resets startTime to time.Unix(0, 0), which is
  	// not the zero time.Time, so an empty, just-sealed active buffer also
  	// seeds tsMemory here — confirm this is intended.
  	var tsMemory time.Time
  	var tsBatchIndex int64
  	if !logBuffer.startTime.IsZero() {
  		tsMemory = logBuffer.startTime
  		tsBatchIndex = logBuffer.batchIndex
  	}
  	for _, prevBuf := range logBuffer.prevBuffers.buffers {
  		if !prevBuf.startTime.IsZero() && prevBuf.startTime.Before(tsMemory) {
  			tsMemory = prevBuf.startTime
  			tsBatchIndex = prevBuf.batchIndex
  		}
  	}
  	if tsMemory.IsZero() { // case 2.2
  		// println("2.2 no data")
  		return nil, -2, nil
  	} else if lastReadPosition.Before(tsMemory) && lastReadPosition.BatchIndex+1 < tsBatchIndex { // case 2.3
  		// Only redirect to disk if something has actually been flushed;
  		// otherwise fall through and serve from memory.
  		if !logBuffer.lastFlushDataTime.IsZero() {
  			glog.V(0).Infof("resume with last flush time: %v", logBuffer.lastFlushDataTime)
  			return nil, -2, ResumeFromDiskError
  		}
  	}

  	// the following is case 2.1

  	// The reader is exactly at, or beyond, the newest entry: nothing to return.
  	if lastReadPosition.Equal(logBuffer.stopTime) {
  		return nil, logBuffer.batchIndex, nil
  	}
  	if lastReadPosition.After(logBuffer.stopTime) {
  		// glog.Fatalf("unexpected last read time %v, older than latest %v", lastReadPosition, m.stopTime)
  		return nil, logBuffer.batchIndex, nil
  	}
  	// The reader is behind the active buffer: serve from the first sealed
  	// buffer that still has newer data, else return the whole active buffer.
  	if lastReadPosition.Before(logBuffer.startTime) {
  		// println("checking ", lastReadPosition.UnixNano())
  		for _, buf := range logBuffer.prevBuffers.buffers {
  			// The sealed buffer starts after the read position: return all of it.
  			if buf.startTime.After(lastReadPosition.Time) {
  				// glog.V(4).Infof("%s return the %d sealed buffer %v", m.name, i, buf.startTime)
  				// println("return the", i, "th in memory", buf.startTime.UnixNano())
  				return copiedBytes(buf.buf[:buf.size]), buf.batchIndex, nil
  			}
  			// The read position falls inside this sealed buffer: return the
  			// part from the offset that locateByTs reports.
  			if !buf.startTime.After(lastReadPosition.Time) && buf.stopTime.After(lastReadPosition.Time) {
  				pos := buf.locateByTs(lastReadPosition.Time)
  				// fmt.Printf("locate buffer[%d] pos %d\n", i, pos)
  				return copiedBytes(buf.buf[pos:buf.size]), buf.batchIndex, nil
  			}
  		}
  		// glog.V(4).Infof("%s return the current buf %v", m.name, lastReadPosition)
  		return copiedBytes(logBuffer.buf[:logBuffer.pos]), logBuffer.batchIndex, nil
  	}

  	// The read position lies within the active buffer. Binary-search idx (the
  	// entry offsets) for the first entry whose timestamp is greater than
  	// lastTs and return everything from that entry to the write position.
  	lastTs := lastReadPosition.UnixNano()
  	l, h := 0, len(logBuffer.idx)-1

  	/*
  		for i, pos := range m.idx {
  			logEntry, ts := readTs(m.buf, pos)
  			event := &filer_pb.SubscribeMetadataResponse{}
  			proto.Unmarshal(logEntry.Data, event)
  			entry := event.EventNotification.OldEntry
  			if entry == nil {
  				entry = event.EventNotification.NewEntry
  			}
  			fmt.Printf("entry %d ts: %v offset:%d dir:%s name:%s\n", i, time.Unix(0, ts), pos, event.Directory, entry.Name)
  		}
  		fmt.Printf("l=%d, h=%d\n", l, h)
  	*/

  	for l <= h {
  		mid := (l + h) / 2
  		pos := logBuffer.idx[mid]
  		_, t := readTs(logBuffer.buf, pos)
  		if t <= lastTs {
  			// Entry mid was already read; look to the right.
  			l = mid + 1
  		} else if lastTs < t {
  			// Entry mid is unread. It is the answer if its predecessor was
  			// already read (or if it is the first entry, where prevT stays 0).
  			var prevT int64
  			if mid > 0 {
  				_, prevT = readTs(logBuffer.buf, logBuffer.idx[mid-1])
  			}
  			if prevT <= lastTs {
  				// fmt.Printf("found l=%d, m-1=%d(ts=%d), m=%d(ts=%d), h=%d [%d, %d) \n", l, mid-1, prevT, mid, t, h, pos, m.pos)
  				return copiedBytes(logBuffer.buf[pos:logBuffer.pos]), logBuffer.batchIndex, nil
  			}
  			h = mid
  		}
  		// fmt.Printf("l=%d, h=%d\n", l, h)
  	}

  	// FIXME: this could be that the buffer has been flushed already
  	println("Not sure why no data", lastReadPosition.BatchIndex, tsBatchIndex)
  	return nil, -2, nil
  }
  305. func (logBuffer *LogBuffer) ReleaseMemory(b *bytes.Buffer) {
  306. bufferPool.Put(b)
  307. }
  308. // GetName returns the log buffer name for metadata tracking
  309. func (logBuffer *LogBuffer) GetName() string {
  310. logBuffer.RLock()
  311. defer logBuffer.RUnlock()
  312. return logBuffer.name
  313. }
  314. // GetBatchIndex returns the current batch index for metadata tracking
  315. func (logBuffer *LogBuffer) GetBatchIndex() int64 {
  316. logBuffer.RLock()
  317. defer logBuffer.RUnlock()
  318. return logBuffer.batchIndex
  319. }
  // bufferPool recycles the *bytes.Buffer values used for copies of buffer
  // contents. A fresh, empty buffer is created when the pool has none to offer.
  var bufferPool = sync.Pool{
  	New: func() any { return new(bytes.Buffer) },
  }
  325. func copiedBytes(buf []byte) (copied *bytes.Buffer) {
  326. copied = bufferPool.Get().(*bytes.Buffer)
  327. copied.Reset()
  328. copied.Write(buf)
  329. return
  330. }
  331. func readTs(buf []byte, pos int) (size int, ts int64) {
  332. size = int(util.BytesToUint32(buf[pos : pos+4]))
  333. entryData := buf[pos+4 : pos+4+size]
  334. logEntry := &filer_pb.LogEntry{}
  335. err := proto.Unmarshal(entryData, logEntry)
  336. if err != nil {
  337. glog.Fatalf("unexpected unmarshal filer_pb.LogEntry: %v", err)
  338. }
  339. return size, logEntry.TsNs
  340. }