| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666 |
- package schema
- import (
- "math/big"
- "testing"
- "time"
- "github.com/parquet-go/parquet-go"
- "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
- )
- func TestToParquetValue_BasicTypes(t *testing.T) {
- tests := []struct {
- name string
- value *schema_pb.Value
- expected parquet.Value
- wantErr bool
- }{
- {
- name: "BoolValue true",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_BoolValue{BoolValue: true},
- },
- expected: parquet.BooleanValue(true),
- },
- {
- name: "Int32Value",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_Int32Value{Int32Value: 42},
- },
- expected: parquet.Int32Value(42),
- },
- {
- name: "Int64Value",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_Int64Value{Int64Value: 12345678901234},
- },
- expected: parquet.Int64Value(12345678901234),
- },
- {
- name: "FloatValue",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_FloatValue{FloatValue: 3.14159},
- },
- expected: parquet.FloatValue(3.14159),
- },
- {
- name: "DoubleValue",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DoubleValue{DoubleValue: 2.718281828},
- },
- expected: parquet.DoubleValue(2.718281828),
- },
- {
- name: "BytesValue",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_BytesValue{BytesValue: []byte("hello world")},
- },
- expected: parquet.ByteArrayValue([]byte("hello world")),
- },
- {
- name: "BytesValue empty",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_BytesValue{BytesValue: []byte{}},
- },
- expected: parquet.ByteArrayValue([]byte{}),
- },
- {
- name: "StringValue",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_StringValue{StringValue: "test string"},
- },
- expected: parquet.ByteArrayValue([]byte("test string")),
- },
- }
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result, err := toParquetValue(tt.value)
- if (err != nil) != tt.wantErr {
- t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
- return
- }
- if !parquetValuesEqual(result, tt.expected) {
- t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
- }
- })
- }
- }
- func TestToParquetValue_TimestampValue(t *testing.T) {
- tests := []struct {
- name string
- value *schema_pb.Value
- expected parquet.Value
- wantErr bool
- }{
- {
- name: "Valid TimestampValue UTC",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimestampValue{
- TimestampValue: &schema_pb.TimestampValue{
- TimestampMicros: 1704067200000000, // 2024-01-01 00:00:00 UTC in microseconds
- IsUtc: true,
- },
- },
- },
- expected: parquet.Int64Value(1704067200000000),
- },
- {
- name: "Valid TimestampValue local",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimestampValue{
- TimestampValue: &schema_pb.TimestampValue{
- TimestampMicros: 1704067200000000,
- IsUtc: false,
- },
- },
- },
- expected: parquet.Int64Value(1704067200000000),
- },
- {
- name: "TimestampValue zero",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimestampValue{
- TimestampValue: &schema_pb.TimestampValue{
- TimestampMicros: 0,
- IsUtc: true,
- },
- },
- },
- expected: parquet.Int64Value(0),
- },
- {
- name: "TimestampValue negative (before epoch)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimestampValue{
- TimestampValue: &schema_pb.TimestampValue{
- TimestampMicros: -1000000, // 1 second before epoch
- IsUtc: true,
- },
- },
- },
- expected: parquet.Int64Value(-1000000),
- },
- {
- name: "TimestampValue nil pointer",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimestampValue{
- TimestampValue: nil,
- },
- },
- expected: parquet.NullValue(),
- },
- }
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result, err := toParquetValue(tt.value)
- if (err != nil) != tt.wantErr {
- t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
- return
- }
- if !parquetValuesEqual(result, tt.expected) {
- t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
- }
- })
- }
- }
- func TestToParquetValue_DateValue(t *testing.T) {
- tests := []struct {
- name string
- value *schema_pb.Value
- expected parquet.Value
- wantErr bool
- }{
- {
- name: "Valid DateValue (2024-01-01)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DateValue{
- DateValue: &schema_pb.DateValue{
- DaysSinceEpoch: 19723, // 2024-01-01 = 19723 days since epoch
- },
- },
- },
- expected: parquet.Int32Value(19723),
- },
- {
- name: "DateValue epoch (1970-01-01)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DateValue{
- DateValue: &schema_pb.DateValue{
- DaysSinceEpoch: 0,
- },
- },
- },
- expected: parquet.Int32Value(0),
- },
- {
- name: "DateValue before epoch",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DateValue{
- DateValue: &schema_pb.DateValue{
- DaysSinceEpoch: -365, // 1969-01-01
- },
- },
- },
- expected: parquet.Int32Value(-365),
- },
- {
- name: "DateValue nil pointer",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DateValue{
- DateValue: nil,
- },
- },
- expected: parquet.NullValue(),
- },
- }
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result, err := toParquetValue(tt.value)
- if (err != nil) != tt.wantErr {
- t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
- return
- }
- if !parquetValuesEqual(result, tt.expected) {
- t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
- }
- })
- }
- }
- func TestToParquetValue_DecimalValue(t *testing.T) {
- tests := []struct {
- name string
- value *schema_pb.Value
- expected parquet.Value
- wantErr bool
- }{
- {
- name: "Small Decimal (precision <= 9) - positive",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: encodeBigIntToBytes(big.NewInt(12345)), // 123.45 with scale 2
- Precision: 5,
- Scale: 2,
- },
- },
- },
- expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(12345))), // FixedLenByteArray conversion
- },
- {
- name: "Small Decimal (precision <= 9) - negative",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: encodeBigIntToBytes(big.NewInt(-12345)),
- Precision: 5,
- Scale: 2,
- },
- },
- },
- expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(-12345))), // FixedLenByteArray conversion
- },
- {
- name: "Medium Decimal (9 < precision <= 18)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: encodeBigIntToBytes(big.NewInt(123456789012345)),
- Precision: 15,
- Scale: 2,
- },
- },
- },
- expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(123456789012345))), // FixedLenByteArray conversion
- },
- {
- name: "Large Decimal (precision > 18)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: []byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF}, // Large number as bytes
- Precision: 25,
- Scale: 5,
- },
- },
- },
- expected: createFixedLenByteArray([]byte{0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF}), // FixedLenByteArray conversion
- },
- {
- name: "Decimal with zero precision",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: encodeBigIntToBytes(big.NewInt(0)),
- Precision: 0,
- Scale: 0,
- },
- },
- },
- expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(0))), // Zero as FixedLenByteArray
- },
- {
- name: "Decimal nil pointer",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: nil,
- },
- },
- expected: parquet.NullValue(),
- },
- {
- name: "Decimal with nil Value bytes",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: nil, // This was the original panic cause
- Precision: 5,
- Scale: 2,
- },
- },
- },
- expected: parquet.NullValue(),
- },
- {
- name: "Decimal with empty Value bytes",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: []byte{}, // Empty slice
- Precision: 5,
- Scale: 2,
- },
- },
- },
- expected: parquet.NullValue(), // Returns null for empty bytes
- },
- {
- name: "Decimal out of int32 range (stored as binary)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: encodeBigIntToBytes(big.NewInt(999999999999)), // Too large for int32
- Precision: 5, // But precision says int32
- Scale: 0,
- },
- },
- },
- expected: createFixedLenByteArray(encodeBigIntToBytes(big.NewInt(999999999999))), // FixedLenByteArray
- },
- {
- name: "Decimal out of int64 range (stored as binary)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: func() []byte {
- // Create a number larger than int64 max
- bigNum := new(big.Int)
- bigNum.SetString("99999999999999999999999999999", 10)
- return encodeBigIntToBytes(bigNum)
- }(),
- Precision: 15, // Says int64 but value is too large
- Scale: 0,
- },
- },
- },
- expected: createFixedLenByteArray(func() []byte {
- bigNum := new(big.Int)
- bigNum.SetString("99999999999999999999999999999", 10)
- return encodeBigIntToBytes(bigNum)
- }()), // Large number as FixedLenByteArray (truncated to 16 bytes)
- },
- {
- name: "Decimal extremely large value (should be rejected)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: make([]byte, 100), // 100 bytes > 64 byte limit
- Precision: 100,
- Scale: 0,
- },
- },
- },
- expected: parquet.NullValue(),
- wantErr: true, // Should return error instead of corrupting data
- },
- }
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result, err := toParquetValue(tt.value)
- if (err != nil) != tt.wantErr {
- t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
- return
- }
- if !parquetValuesEqual(result, tt.expected) {
- t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
- }
- })
- }
- }
- func TestToParquetValue_TimeValue(t *testing.T) {
- tests := []struct {
- name string
- value *schema_pb.Value
- expected parquet.Value
- wantErr bool
- }{
- {
- name: "Valid TimeValue (12:34:56.789)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimeValue{
- TimeValue: &schema_pb.TimeValue{
- TimeMicros: 45296789000, // 12:34:56.789 in microseconds since midnight
- },
- },
- },
- expected: parquet.Int64Value(45296789000),
- },
- {
- name: "TimeValue midnight",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimeValue{
- TimeValue: &schema_pb.TimeValue{
- TimeMicros: 0,
- },
- },
- },
- expected: parquet.Int64Value(0),
- },
- {
- name: "TimeValue end of day (23:59:59.999999)",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimeValue{
- TimeValue: &schema_pb.TimeValue{
- TimeMicros: 86399999999, // 23:59:59.999999
- },
- },
- },
- expected: parquet.Int64Value(86399999999),
- },
- {
- name: "TimeValue nil pointer",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_TimeValue{
- TimeValue: nil,
- },
- },
- expected: parquet.NullValue(),
- },
- }
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result, err := toParquetValue(tt.value)
- if (err != nil) != tt.wantErr {
- t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
- return
- }
- if !parquetValuesEqual(result, tt.expected) {
- t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
- }
- })
- }
- }
- func TestToParquetValue_EdgeCases(t *testing.T) {
- tests := []struct {
- name string
- value *schema_pb.Value
- expected parquet.Value
- wantErr bool
- }{
- {
- name: "Nil value",
- value: &schema_pb.Value{
- Kind: nil,
- },
- wantErr: true,
- },
- {
- name: "Completely nil value",
- value: nil,
- wantErr: true,
- },
- {
- name: "BytesValue with nil slice",
- value: &schema_pb.Value{
- Kind: &schema_pb.Value_BytesValue{BytesValue: nil},
- },
- expected: parquet.ByteArrayValue([]byte{}), // Should convert nil to empty slice
- },
- }
- for _, tt := range tests {
- t.Run(tt.name, func(t *testing.T) {
- result, err := toParquetValue(tt.value)
- if (err != nil) != tt.wantErr {
- t.Errorf("toParquetValue() error = %v, wantErr %v", err, tt.wantErr)
- return
- }
- if !tt.wantErr && !parquetValuesEqual(result, tt.expected) {
- t.Errorf("toParquetValue() = %v, want %v", result, tt.expected)
- }
- })
- }
- }
// encodeBigIntToBytes returns the minimal big-endian two's complement
// encoding of n.
//
// The result always reserves room for the sign bit, so it can be decoded
// unambiguously: a positive value whose top magnitude bit is set gets a
// leading 0x00 byte (128 -> 00 80), and a negative value that does not fit in
// the magnitude's byte count gets a leading 0xFF byte (-129 -> FF 7F). Zero is
// encoded as a single 0x00 byte. n itself is not modified.
func encodeBigIntToBytes(n *big.Int) []byte {
	sign := n.Sign()
	if sign == 0 {
		return []byte{0}
	}

	// Number of value bits that must be representable next to the sign bit.
	// For a negative n this is the bit length of ^n (== -n-1), which is why
	// -128 still fits in one byte while -129 needs two.
	valueBits := n.BitLen()
	if sign < 0 {
		valueBits = new(big.Int).Not(n).BitLen()
	}
	byteLen := (valueBits + 1 + 7) / 8 // +1 for the sign bit, rounded up to whole bytes

	out := make([]byte, byteLen)
	if sign > 0 {
		// FillBytes zero-extends on the left, supplying the 0x00 sign byte when needed.
		return n.FillBytes(out)
	}

	// Negative: the two's complement pattern is n + 2^(8*byteLen), which is a
	// non-negative number that fits exactly in byteLen bytes.
	modulus := new(big.Int).Lsh(big.NewInt(1), uint(byteLen*8))
	return new(big.Int).Add(n, modulus).FillBytes(out)
}
- // Helper function to create a FixedLenByteArray(16) matching our conversion logic
- func createFixedLenByteArray(inputBytes []byte) parquet.Value {
- fixedBytes := make([]byte, 16)
- if len(inputBytes) <= 16 {
- // Right-align the value (big-endian) - same as our conversion logic
- copy(fixedBytes[16-len(inputBytes):], inputBytes)
- } else {
- // Truncate if too large, taking the least significant bytes
- copy(fixedBytes, inputBytes[len(inputBytes)-16:])
- }
- return parquet.FixedLenByteArrayValue(fixedBytes)
- }
- // Helper function to compare parquet values
- func parquetValuesEqual(a, b parquet.Value) bool {
- // Handle both being null
- if a.IsNull() && b.IsNull() {
- return true
- }
- if a.IsNull() != b.IsNull() {
- return false
- }
- // Compare kind first
- if a.Kind() != b.Kind() {
- return false
- }
- // Compare based on type
- switch a.Kind() {
- case parquet.Boolean:
- return a.Boolean() == b.Boolean()
- case parquet.Int32:
- return a.Int32() == b.Int32()
- case parquet.Int64:
- return a.Int64() == b.Int64()
- case parquet.Float:
- return a.Float() == b.Float()
- case parquet.Double:
- return a.Double() == b.Double()
- case parquet.ByteArray:
- aBytes := a.ByteArray()
- bBytes := b.ByteArray()
- if len(aBytes) != len(bBytes) {
- return false
- }
- for i, v := range aBytes {
- if v != bBytes[i] {
- return false
- }
- }
- return true
- case parquet.FixedLenByteArray:
- aBytes := a.ByteArray() // FixedLenByteArray also uses ByteArray() method
- bBytes := b.ByteArray()
- if len(aBytes) != len(bBytes) {
- return false
- }
- for i, v := range aBytes {
- if v != bBytes[i] {
- return false
- }
- }
- return true
- default:
- return false
- }
- }
- // Benchmark tests
- func BenchmarkToParquetValue_BasicTypes(b *testing.B) {
- value := &schema_pb.Value{
- Kind: &schema_pb.Value_Int64Value{Int64Value: 12345678901234},
- }
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- _, _ = toParquetValue(value)
- }
- }
- func BenchmarkToParquetValue_TimestampValue(b *testing.B) {
- value := &schema_pb.Value{
- Kind: &schema_pb.Value_TimestampValue{
- TimestampValue: &schema_pb.TimestampValue{
- TimestampMicros: time.Now().UnixMicro(),
- IsUtc: true,
- },
- },
- }
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- _, _ = toParquetValue(value)
- }
- }
- func BenchmarkToParquetValue_DecimalValue(b *testing.B) {
- value := &schema_pb.Value{
- Kind: &schema_pb.Value_DecimalValue{
- DecimalValue: &schema_pb.DecimalValue{
- Value: encodeBigIntToBytes(big.NewInt(123456789012345)),
- Precision: 15,
- Scale: 2,
- },
- },
- }
- b.ResetTimer()
- for i := 0; i < b.N; i++ {
- _, _ = toParquetValue(value)
- }
- }
|