to_parquet_value_test.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666
  1. package schema
  2. import (
  3. "math/big"
  4. "testing"
  5. "time"
  6. "github.com/parquet-go/parquet-go"
  7. "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
  8. )
  9. func TestToParquetValue_BasicTypes(t *testing.T) {
  10. tests := []struct {
  11. name string
  12. value *schema_pb.Value
  13. expected parquet.Value
  14. wantErr bool
  15. }{
  16. {
  17. name: "BoolValue true",
  18. value: &schema_pb.Value{
  19. Kind: &schema_pb.Value_BoolValue{BoolValue: true},
  20. },
  21. expected: parquet.BooleanValue(true),
  22. },
  23. {
  24. name: "Int32Value",
  25. value: &schema_pb.Value{
  26. Kind: &schema_pb.Value_Int32Value{Int32Value: 42},
  27. },
  28. expected: parquet.Int32Value(42),
  29. },
  30. {
  31. name: "Int64Value",
  32. value: &schema_pb.Value{
  33. Kind: &schema_pb.Value_Int64Value{Int64Value: 12345678901234},
  34. },
  35. expected: parquet.Int64Value(12345678901234),
  36. },
  37. {
  38. name: "FloatValue",
  39. value: &schema_pb.Value{
  40. Kind: &schema_pb.Value_FloatValue{FloatValue: 3.14159},
  41. },
  42. expected: parquet.FloatValue(3.14159),
  43. },
  44. {
  45. name: "DoubleValue",
  46. value: &schema_pb.Value{
  47. Kind: &schema_pb.Value_DoubleValue{DoubleValue: 2.718281828},
  48. },
  49. expected: parquet.DoubleValue(2.718281828),
  50. },
  51. {
  52. name: "BytesValue",
  53. value: &schema_pb.Value{
  54. Kind: &schema_pb.Value_BytesValue{BytesValue: []byte("hello world")},
  55. },
  56. expected: parquet.ByteArrayValue([]byte("hello world")),
  57. },
  58. {
  59. name: "BytesValue empty",
  60. value: &schema_pb.Value{
  61. Kind: &schema_pb.Value_BytesValue{BytesValue: []byte{}},
  62. },
  63. expected: parquet.ByteArrayValue([]byte{}),
  64. },
  65. {
  66. name: "StringValue",
  67. value: &schema_pb.Value{
  68. Kind: &schema_pb.Value_StringValue{StringValue: "test string"},
  69. },
  70. expected: parquet.ByteArrayValue([]byte("test string")),
  71. },
  72. }
  73. for _, tt := range tests {
  74. t.Run(tt.name, func(t *testing.T) {
  75. result, err := toParquetValue(tt.value)
  76. if (err != nil) != tt.wantErr {
  77. t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
  78. return
  79. }
  80. if !parquetValuesEqual(result, tt.expected) {
  81. t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
  82. }
  83. })
  84. }
  85. }
  86. func TestToParquetValue_TimestampValue(t *testing.T) {
  87. tests := []struct {
  88. name string
  89. value *schema_pb.Value
  90. expected parquet.Value
  91. wantErr bool
  92. }{
  93. {
  94. name: "Valid TimestampValue UTC",
  95. value: &schema_pb.Value{
  96. Kind: &schema_pb.Value_TimestampValue{
  97. TimestampValue: &schema_pb.TimestampValue{
  98. TimestampMicros: 1704067200000000, // 2024-01-01 00:00:00 UTC in microseconds
  99. IsUtc: true,
  100. },
  101. },
  102. },
  103. expected: parquet.Int64Value(1704067200000000),
  104. },
  105. {
  106. name: "Valid TimestampValue local",
  107. value: &schema_pb.Value{
  108. Kind: &schema_pb.Value_TimestampValue{
  109. TimestampValue: &schema_pb.TimestampValue{
  110. TimestampMicros: 1704067200000000,
  111. IsUtc: false,
  112. },
  113. },
  114. },
  115. expected: parquet.Int64Value(1704067200000000),
  116. },
  117. {
  118. name: "TimestampValue zero",
  119. value: &schema_pb.Value{
  120. Kind: &schema_pb.Value_TimestampValue{
  121. TimestampValue: &schema_pb.TimestampValue{
  122. TimestampMicros: 0,
  123. IsUtc: true,
  124. },
  125. },
  126. },
  127. expected: parquet.Int64Value(0),
  128. },
  129. {
  130. name: "TimestampValue negative (before epoch)",
  131. value: &schema_pb.Value{
  132. Kind: &schema_pb.Value_TimestampValue{
  133. TimestampValue: &schema_pb.TimestampValue{
  134. TimestampMicros: -1000000, // 1 second before epoch
  135. IsUtc: true,
  136. },
  137. },
  138. },
  139. expected: parquet.Int64Value(-1000000),
  140. },
  141. {
  142. name: "TimestampValue nil pointer",
  143. value: &schema_pb.Value{
  144. Kind: &schema_pb.Value_TimestampValue{
  145. TimestampValue: nil,
  146. },
  147. },
  148. expected: parquet.NullValue(),
  149. },
  150. }
  151. for _, tt := range tests {
  152. t.Run(tt.name, func(t *testing.T) {
  153. result, err := toParquetValue(tt.value)
  154. if (err != nil) != tt.wantErr {
  155. t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
  156. return
  157. }
  158. if !parquetValuesEqual(result, tt.expected) {
  159. t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
  160. }
  161. })
  162. }
  163. }
  164. func TestToParquetValue_DateValue(t *testing.T) {
  165. tests := []struct {
  166. name string
  167. value *schema_pb.Value
  168. expected parquet.Value
  169. wantErr bool
  170. }{
  171. {
  172. name: "Valid DateValue (2024-01-01)",
  173. value: &schema_pb.Value{
  174. Kind: &schema_pb.Value_DateValue{
  175. DateValue: &schema_pb.DateValue{
  176. DaysSinceEpoch: 19723, // 2024-01-01 = 19723 days since epoch
  177. },
  178. },
  179. },
  180. expected: parquet.Int32Value(19723),
  181. },
  182. {
  183. name: "DateValue epoch (1970-01-01)",
  184. value: &schema_pb.Value{
  185. Kind: &schema_pb.Value_DateValue{
  186. DateValue: &schema_pb.DateValue{
  187. DaysSinceEpoch: 0,
  188. },
  189. },
  190. },
  191. expected: parquet.Int32Value(0),
  192. },
  193. {
  194. name: "DateValue before epoch",
  195. value: &schema_pb.Value{
  196. Kind: &schema_pb.Value_DateValue{
  197. DateValue: &schema_pb.DateValue{
  198. DaysSinceEpoch: -365, // 1969-01-01
  199. },
  200. },
  201. },
  202. expected: parquet.Int32Value(-365),
  203. },
  204. {
  205. name: "DateValue nil pointer",
  206. value: &schema_pb.Value{
  207. Kind: &schema_pb.Value_DateValue{
  208. DateValue: nil,
  209. },
  210. },
  211. expected: parquet.NullValue(),
  212. },
  213. }
  214. for _, tt := range tests {
  215. t.Run(tt.name, func(t *testing.T) {
  216. result, err := toParquetValue(tt.value)
  217. if (err != nil) != tt.wantErr {
  218. t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
  219. return
  220. }
  221. if !parquetValuesEqual(result, tt.expected) {
  222. t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
  223. }
  224. })
  225. }
  226. }
  227. func TestToParquetValue_DecimalValue(t *testing.T) {
  228. tests := []struct {
  229. name string
  230. value *schema_pb.Value
  231. expected parquet.Value
  232. wantErr bool
  233. }{
  234. {
  235. name: "Small Decimal (precision <= 9) - positive",
  236. value: &schema_pb.Value{
  237. Kind: &schema_pb.Value_DecimalValue{
  238. DecimalValue: &schema_pb.DecimalValue{
  239. Value: encodeBigIntToBytes(big.NewInt(12345)), // 123.45 with scale 2
  240. Precision: 5,
  241. Scale: 2,
  242. },
  243. },
  244. },
  245. expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(12345))), // FixedLenByteArray conversion
  246. },
  247. {
  248. name: "Small Decimal (precision <= 9) - negative",
  249. value: &schema_pb.Value{
  250. Kind: &schema_pb.Value_DecimalValue{
  251. DecimalValue: &schema_pb.DecimalValue{
  252. Value: encodeBigIntToBytes(big.NewInt(-12345)),
  253. Precision: 5,
  254. Scale: 2,
  255. },
  256. },
  257. },
  258. expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(-12345))), // FixedLenByteArray conversion
  259. },
  260. {
  261. name: "Medium Decimal (9 < precision <= 18)",
  262. value: &schema_pb.Value{
  263. Kind: &schema_pb.Value_DecimalValue{
  264. DecimalValue: &schema_pb.DecimalValue{
  265. Value: encodeBigIntToBytes(big.NewInt(123456789012345)),
  266. Precision: 15,
  267. Scale: 2,
  268. },
  269. },
  270. },
  271. expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(123456789012345))), // FixedLenByteArray conversion
  272. },
  273. {
  274. name: "Large Decimal (precision > 18)",
  275. value: &schema_pb.Value{
  276. Kind: &schema_pb.Value_DecimalValue{
  277. DecimalValue: &schema_pb.DecimalValue{
  278. Value: []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF}, // Large number as bytes
  279. Precision: 25,
  280. Scale: 5,
  281. },
  282. },
  283. },
  284. expected: createFixedLenByteArray([]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF}), // FixedLenByteArray conversion
  285. },
  286. {
  287. name: "Decimal with zero precision",
  288. value: &schema_pb.Value{
  289. Kind: &schema_pb.Value_DecimalValue{
  290. DecimalValue: &schema_pb.DecimalValue{
  291. Value: encodeBigIntToBytes(big.NewInt(0)),
  292. Precision: 0,
  293. Scale: 0,
  294. },
  295. },
  296. },
  297. expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(0))), // Zero as FixedLenByteArray
  298. },
  299. {
  300. name: "Decimal nil pointer",
  301. value: &schema_pb.Value{
  302. Kind: &schema_pb.Value_DecimalValue{
  303. DecimalValue: nil,
  304. },
  305. },
  306. expected: parquet.NullValue(),
  307. },
  308. {
  309. name: "Decimal with nil Value bytes",
  310. value: &schema_pb.Value{
  311. Kind: &schema_pb.Value_DecimalValue{
  312. DecimalValue: &schema_pb.DecimalValue{
  313. Value: nil, // This was the original panic cause
  314. Precision: 5,
  315. Scale: 2,
  316. },
  317. },
  318. },
  319. expected: parquet.NullValue(),
  320. },
  321. {
  322. name: "Decimal with empty Value bytes",
  323. value: &schema_pb.Value{
  324. Kind: &schema_pb.Value_DecimalValue{
  325. DecimalValue: &schema_pb.DecimalValue{
  326. Value: []byte{}, // Empty slice
  327. Precision: 5,
  328. Scale: 2,
  329. },
  330. },
  331. },
  332. expected: parquet.NullValue(), // Returns null for empty bytes
  333. },
  334. {
  335. name: "Decimal out of int32 range (stored as binary)",
  336. value: &schema_pb.Value{
  337. Kind: &schema_pb.Value_DecimalValue{
  338. DecimalValue: &schema_pb.DecimalValue{
  339. Value: encodeBigIntToBytes(big.NewInt(999999999999)), // Too large for int32
  340. Precision: 5, // But precision says int32
  341. Scale: 0,
  342. },
  343. },
  344. },
  345. expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(999999999999))), // FixedLenByteArray
  346. },
  347. {
  348. name: "Decimal out of int64 range (stored as binary)",
  349. value: &schema_pb.Value{
  350. Kind: &schema_pb.Value_DecimalValue{
  351. DecimalValue: &schema_pb.DecimalValue{
  352. Value: func() []byte {
  353. // Create a number larger than int64 max
  354. bigNum := new(big.Int)
  355. bigNum.SetString("99999999999999999999999999999", 10)
  356. return encodeBigIntToBytes(bigNum)
  357. }(),
  358. Precision: 15, // Says int64 but value is too large
  359. Scale: 0,
  360. },
  361. },
  362. },
  363. expected: createFixedLenByteArray(func() []byte {
  364. bigNum := new(big.Int)
  365. bigNum.SetString("99999999999999999999999999999", 10)
  366. return encodeBigIntToBytes(bigNum)
  367. }()), // Large number as FixedLenByteArray (truncated to 16 bytes)
  368. },
  369. {
  370. name: "Decimal extremely large value (should be rejected)",
  371. value: &schema_pb.Value{
  372. Kind: &schema_pb.Value_DecimalValue{
  373. DecimalValue: &schema_pb.DecimalValue{
  374. Value: make([]byte, 100), // 100 bytes > 64 byte limit
  375. Precision: 100,
  376. Scale: 0,
  377. },
  378. },
  379. },
  380. expected: parquet.NullValue(),
  381. wantErr: true, // Should return error instead of corrupting data
  382. },
  383. }
  384. for _, tt := range tests {
  385. t.Run(tt.name, func(t *testing.T) {
  386. result, err := toParquetValue(tt.value)
  387. if (err != nil) != tt.wantErr {
  388. t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
  389. return
  390. }
  391. if !parquetValuesEqual(result, tt.expected) {
  392. t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
  393. }
  394. })
  395. }
  396. }
  397. func TestToParquetValue_TimeValue(t *testing.T) {
  398. tests := []struct {
  399. name string
  400. value *schema_pb.Value
  401. expected parquet.Value
  402. wantErr bool
  403. }{
  404. {
  405. name: "Valid TimeValue (12:34:56.789)",
  406. value: &schema_pb.Value{
  407. Kind: &schema_pb.Value_TimeValue{
  408. TimeValue: &schema_pb.TimeValue{
  409. TimeMicros: 45296789000, // 12:34:56.789 in microseconds since midnight
  410. },
  411. },
  412. },
  413. expected: parquet.Int64Value(45296789000),
  414. },
  415. {
  416. name: "TimeValue midnight",
  417. value: &schema_pb.Value{
  418. Kind: &schema_pb.Value_TimeValue{
  419. TimeValue: &schema_pb.TimeValue{
  420. TimeMicros: 0,
  421. },
  422. },
  423. },
  424. expected: parquet.Int64Value(0),
  425. },
  426. {
  427. name: "TimeValue end of day (23:59:59.999999)",
  428. value: &schema_pb.Value{
  429. Kind: &schema_pb.Value_TimeValue{
  430. TimeValue: &schema_pb.TimeValue{
  431. TimeMicros: 86399999999, // 23:59:59.999999
  432. },
  433. },
  434. },
  435. expected: parquet.Int64Value(86399999999),
  436. },
  437. {
  438. name: "TimeValue nil pointer",
  439. value: &schema_pb.Value{
  440. Kind: &schema_pb.Value_TimeValue{
  441. TimeValue: nil,
  442. },
  443. },
  444. expected: parquet.NullValue(),
  445. },
  446. }
  447. for _, tt := range tests {
  448. t.Run(tt.name, func(t *testing.T) {
  449. result, err := toParquetValue(tt.value)
  450. if (err != nil) != tt.wantErr {
  451. t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
  452. return
  453. }
  454. if !parquetValuesEqual(result, tt.expected) {
  455. t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
  456. }
  457. })
  458. }
  459. }
  460. func TestToParquetValue_EdgeCases(t *testing.T) {
  461. tests := []struct {
  462. name string
  463. value *schema_pb.Value
  464. expected parquet.Value
  465. wantErr bool
  466. }{
  467. {
  468. name: "Nil value",
  469. value: &schema_pb.Value{
  470. Kind: nil,
  471. },
  472. wantErr: true,
  473. },
  474. {
  475. name: "Completely nil value",
  476. value: nil,
  477. wantErr: true,
  478. },
  479. {
  480. name: "BytesValue with nil slice",
  481. value: &schema_pb.Value{
  482. Kind: &schema_pb.Value_BytesValue{BytesValue: nil},
  483. },
  484. expected: parquet.ByteArrayValue([]byte{}), // Should convert nil to empty slice
  485. },
  486. }
  487. for _, tt := range tests {
  488. t.Run(tt.name, func(t *testing.T) {
  489. result, err := toParquetValue(tt.value)
  490. if (err != nil) != tt.wantErr {
  491. t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
  492. return
  493. }
  494. if !tt.wantErr && !parquetValuesEqual(result, tt.expected) {
  495. t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
  496. }
  497. })
  498. }
  499. }
  // encodeBigIntToBytes returns the minimal big-endian two's complement encoding
  // of n.
  //
  // Zero encodes as a single 0x00 byte. A positive value whose most significant
  // magnitude bit is set gets a leading 0x00 so that it cannot be read back as a
  // negative number (128 -> 00 80). A negative value is given the smallest byte
  // width k with -2^(8k-1) <= n, so the sign bit is always set in the result
  // (-129 -> FF 7F, not 7F).
  //
  // n must not be nil.
  func encodeBigIntToBytes(n *big.Int) []byte {
  	switch n.Sign() {
  	case 0:
  		return []byte{0}
  	case 1:
  		magnitude := n.Bytes()
  		if magnitude[0]&0x80 != 0 {
  			// The top bit is the sign bit in two's complement; keep it clear.
  			return append([]byte{0}, magnitude...)
  		}
  		return magnitude
  	}

  	// Negative n fits in k bytes iff |n|-1 < 2^(8k-1), i.e. iff
  	// BitLen(|n|-1) <= 8k-1. The smallest such k is BitLen(|n|-1)/8 + 1.
  	absMinusOne := new(big.Int).Neg(n)
  	absMinusOne.Sub(absMinusOne, big.NewInt(1))
  	byteLen := absMinusOne.BitLen()/8 + 1

  	// Two's complement of a negative number in k bytes is n + 2^(8k).
  	modulus := new(big.Int).Lsh(big.NewInt(1), uint(byteLen*8))
  	encoded := new(big.Int).Add(n, modulus)

  	// encoded is in [2^(8k-1), 2^(8k)), so it always fits in byteLen bytes.
  	return encoded.FillBytes(make([]byte, byteLen))
  }
  531. // Helper function to create a FixedLenByteArray(16) matching our conversion logic
  532. func createFixedLenByteArray(inputBytes []byte) parquet.Value {
  533. fixedBytes := make([]byte, 16)
  534. if len(inputBytes) <= 16 {
  535. // Right-align the value (big-endian) - same as our conversion logic
  536. copy(fixedBytes[16-len(inputBytes):], inputBytes)
  537. } else {
  538. // Truncate if too large, taking the least significant bytes
  539. copy(fixedBytes, inputBytes[len(inputBytes)-16:])
  540. }
  541. return parquet.FixedLenByteArrayValue(fixedBytes)
  542. }
  543. // Helper function to compare parquet values
  544. func parquetValuesEqual(a, b parquet.Value) bool {
  545. // Handle both being null
  546. if a.IsNull() && b.IsNull() {
  547. return true
  548. }
  549. if a.IsNull() != b.IsNull() {
  550. return false
  551. }
  552. // Compare kind first
  553. if a.Kind() != b.Kind() {
  554. return false
  555. }
  556. // Compare based on type
  557. switch a.Kind() {
  558. case parquet.Boolean:
  559. return a.Boolean() == b.Boolean()
  560. case parquet.Int32:
  561. return a.Int32() == b.Int32()
  562. case parquet.Int64:
  563. return a.Int64() == b.Int64()
  564. case parquet.Float:
  565. return a.Float() == b.Float()
  566. case parquet.Double:
  567. return a.Double() == b.Double()
  568. case parquet.ByteArray:
  569. aBytes := a.ByteArray()
  570. bBytes := b.ByteArray()
  571. if len(aBytes) != len(bBytes) {
  572. return false
  573. }
  574. for i, v := range aBytes {
  575. if v != bBytes[i] {
  576. return false
  577. }
  578. }
  579. return true
  580. case parquet.FixedLenByteArray:
  581. aBytes := a.ByteArray() // FixedLenByteArray also uses ByteArray() method
  582. bBytes := b.ByteArray()
  583. if len(aBytes) != len(bBytes) {
  584. return false
  585. }
  586. for i, v := range aBytes {
  587. if v != bBytes[i] {
  588. return false
  589. }
  590. }
  591. return true
  592. default:
  593. return false
  594. }
  595. }
  596. // Benchmark tests
  597. func BenchmarkToParquetValue_BasicTypes(b *testing.B) {
  598. value := &schema_pb.Value{
  599. Kind: &schema_pb.Value_Int64Value{Int64Value: 12345678901234},
  600. }
  601. b.ResetTimer()
  602. for i := 0; i < b.N; i++ {
  603. _, _ = toParquetValue(value)
  604. }
  605. }
  606. func BenchmarkToParquetValue_TimestampValue(b *testing.B) {
  607. value := &schema_pb.Value{
  608. Kind: &schema_pb.Value_TimestampValue{
  609. TimestampValue: &schema_pb.TimestampValue{
  610. TimestampMicros: time.Now().UnixMicro(),
  611. IsUtc: true,
  612. },
  613. },
  614. }
  615. b.ResetTimer()
  616. for i := 0; i < b.N; i++ {
  617. _, _ = toParquetValue(value)
  618. }
  619. }
  620. func BenchmarkToParquetValue_DecimalValue(b *testing.B) {
  621. value := &schema_pb.Value{
  622. Kind: &schema_pb.Value_DecimalValue{
  623. DecimalValue: &schema_pb.DecimalValue{
  624. Value: encodeBigIntToBytes(big.NewInt(123456789012345)),
  625. Precision: 15,
  626. Scale: 2,
  627. },
  628. },
  629. }
  630. b.ResetTimer()
  631. for i := 0; i < b.N; i++ {
  632. _, _ = toParquetValue(value)
  633. }
  634. }