to_parquet_value.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. package schema
  2. import (
  3. "fmt"
  4. "strconv"
  5. parquet "github.com/parquet-go/parquet-go"
  6. "github.com/seaweedfs/seaweedfs/weed/pb/schema_pb"
  7. )
  8. func rowBuilderVisit(rowBuilder *parquet.RowBuilder, fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value) (err error) {
  9. switch fieldType.Kind.(type) {
  10. case *schema_pb.Type_ScalarType:
  11. // If value is missing, write NULL at the correct column to keep rows aligned
  12. if fieldValue == nil || fieldValue.Kind == nil {
  13. rowBuilder.Add(levels.startColumnIndex, parquet.NullValue())
  14. return nil
  15. }
  16. var parquetValue parquet.Value
  17. parquetValue, err = toParquetValueForType(fieldType, fieldValue)
  18. if err != nil {
  19. return
  20. }
  21. // Safety check: prevent nil byte arrays from reaching parquet library
  22. if parquetValue.Kind() == parquet.ByteArray {
  23. byteData := parquetValue.ByteArray()
  24. if byteData == nil {
  25. parquetValue = parquet.ByteArrayValue([]byte{})
  26. }
  27. }
  28. rowBuilder.Add(levels.startColumnIndex, parquetValue)
  29. case *schema_pb.Type_ListType:
  30. // Advance to list position even if value is missing
  31. rowBuilder.Next(levels.startColumnIndex)
  32. if fieldValue == nil || fieldValue.GetListValue() == nil {
  33. return nil
  34. }
  35. elementType := fieldType.GetListType().ElementType
  36. for _, value := range fieldValue.GetListValue().Values {
  37. if err = rowBuilderVisit(rowBuilder, elementType, levels, value); err != nil {
  38. return
  39. }
  40. }
  41. }
  42. return
  43. }
  44. func AddRecordValue(rowBuilder *parquet.RowBuilder, recordType *schema_pb.RecordType, parquetLevels *ParquetLevels, recordValue *schema_pb.RecordValue) error {
  45. visitor := func(fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value) (err error) {
  46. return rowBuilderVisit(rowBuilder, fieldType, levels, fieldValue)
  47. }
  48. fieldType := &schema_pb.Type{Kind: &schema_pb.Type_RecordType{RecordType: recordType}}
  49. fieldValue := &schema_pb.Value{Kind: &schema_pb.Value_RecordValue{RecordValue: recordValue}}
  50. return doVisitValue(fieldType, parquetLevels, fieldValue, visitor)
  51. }
  52. // typeValueVisitor is a function that is called for each value in a schema_pb.Value
  53. // Find the column index.
  54. // intended to be used in RowBuilder.Add(columnIndex, value)
  55. type typeValueVisitor func(fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value) (err error)
  56. // endIndex is exclusive
  57. // same logic as RowBuilder.configure in row_builder.go
  58. func doVisitValue(fieldType *schema_pb.Type, levels *ParquetLevels, fieldValue *schema_pb.Value, visitor typeValueVisitor) (err error) {
  59. switch fieldType.Kind.(type) {
  60. case *schema_pb.Type_ScalarType:
  61. return visitor(fieldType, levels, fieldValue)
  62. case *schema_pb.Type_ListType:
  63. return visitor(fieldType, levels, fieldValue)
  64. case *schema_pb.Type_RecordType:
  65. for _, field := range fieldType.GetRecordType().Fields {
  66. var fv *schema_pb.Value
  67. if fieldValue != nil && fieldValue.GetRecordValue() != nil {
  68. var found bool
  69. fv, found = fieldValue.GetRecordValue().Fields[field.Name]
  70. if !found {
  71. // pass nil so visitor can emit NULL for alignment
  72. fv = nil
  73. }
  74. }
  75. fieldLevels := levels.levels[field.Name]
  76. err = doVisitValue(field.Type, fieldLevels, fv, visitor)
  77. if err != nil {
  78. return
  79. }
  80. }
  81. return
  82. }
  83. return
  84. }
  85. func toParquetValue(value *schema_pb.Value) (parquet.Value, error) {
  86. // Safety check for nil value
  87. if value == nil || value.Kind == nil {
  88. return parquet.NullValue(), fmt.Errorf("nil value or nil value kind")
  89. }
  90. switch value.Kind.(type) {
  91. case *schema_pb.Value_BoolValue:
  92. return parquet.BooleanValue(value.GetBoolValue()), nil
  93. case *schema_pb.Value_Int32Value:
  94. return parquet.Int32Value(value.GetInt32Value()), nil
  95. case *schema_pb.Value_Int64Value:
  96. return parquet.Int64Value(value.GetInt64Value()), nil
  97. case *schema_pb.Value_FloatValue:
  98. return parquet.FloatValue(value.GetFloatValue()), nil
  99. case *schema_pb.Value_DoubleValue:
  100. return parquet.DoubleValue(value.GetDoubleValue()), nil
  101. case *schema_pb.Value_BytesValue:
  102. // Handle nil byte slices to prevent growslice panic in parquet-go
  103. byteData := value.GetBytesValue()
  104. if byteData == nil {
  105. byteData = []byte{} // Use empty slice instead of nil
  106. }
  107. return parquet.ByteArrayValue(byteData), nil
  108. case *schema_pb.Value_StringValue:
  109. // Convert string to bytes, ensuring we never pass nil
  110. stringData := value.GetStringValue()
  111. return parquet.ByteArrayValue([]byte(stringData)), nil
  112. // Parquet logical types with safe conversion (preventing commit 7a4aeec60 panic)
  113. case *schema_pb.Value_TimestampValue:
  114. timestampValue := value.GetTimestampValue()
  115. if timestampValue == nil {
  116. return parquet.NullValue(), nil
  117. }
  118. return parquet.Int64Value(timestampValue.TimestampMicros), nil
  119. case *schema_pb.Value_DateValue:
  120. dateValue := value.GetDateValue()
  121. if dateValue == nil {
  122. return parquet.NullValue(), nil
  123. }
  124. return parquet.Int32Value(dateValue.DaysSinceEpoch), nil
  125. case *schema_pb.Value_DecimalValue:
  126. decimalValue := value.GetDecimalValue()
  127. if decimalValue == nil || decimalValue.Value == nil || len(decimalValue.Value) == 0 {
  128. return parquet.NullValue(), nil
  129. }
  130. // Validate input data - reject unreasonably large values instead of corrupting data
  131. if len(decimalValue.Value) > 64 {
  132. // Reject extremely large decimal values (>512 bits) as likely corrupted data
  133. // Better to fail fast than silently corrupt financial/scientific data
  134. return parquet.NullValue(), fmt.Errorf("decimal value too large: %d bytes (max 64)", len(decimalValue.Value))
  135. }
  136. // Convert to FixedLenByteArray to match schema (DECIMAL with FixedLenByteArray physical type)
  137. // This accommodates any precision up to 38 digits (16 bytes = 128 bits)
  138. // Pad or truncate to exactly 16 bytes for FixedLenByteArray
  139. fixedBytes := make([]byte, 16)
  140. if len(decimalValue.Value) <= 16 {
  141. // Right-align the value (big-endian)
  142. copy(fixedBytes[16-len(decimalValue.Value):], decimalValue.Value)
  143. } else {
  144. // Truncate if too large, taking the least significant bytes
  145. copy(fixedBytes, decimalValue.Value[len(decimalValue.Value)-16:])
  146. }
  147. return parquet.FixedLenByteArrayValue(fixedBytes), nil
  148. case *schema_pb.Value_TimeValue:
  149. timeValue := value.GetTimeValue()
  150. if timeValue == nil {
  151. return parquet.NullValue(), nil
  152. }
  153. return parquet.Int64Value(timeValue.TimeMicros), nil
  154. default:
  155. return parquet.NullValue(), fmt.Errorf("unknown value type: %T", value.Kind)
  156. }
  157. }
  158. // toParquetValueForType coerces a schema_pb.Value into a parquet.Value that matches the declared field type.
  159. func toParquetValueForType(fieldType *schema_pb.Type, value *schema_pb.Value) (parquet.Value, error) {
  160. switch t := fieldType.Kind.(type) {
  161. case *schema_pb.Type_ScalarType:
  162. switch t.ScalarType {
  163. case schema_pb.ScalarType_BOOL:
  164. switch v := value.Kind.(type) {
  165. case *schema_pb.Value_BoolValue:
  166. return parquet.BooleanValue(v.BoolValue), nil
  167. case *schema_pb.Value_StringValue:
  168. if b, err := strconv.ParseBool(v.StringValue); err == nil {
  169. return parquet.BooleanValue(b), nil
  170. }
  171. return parquet.BooleanValue(false), nil
  172. default:
  173. return parquet.BooleanValue(false), nil
  174. }
  175. case schema_pb.ScalarType_INT32:
  176. switch v := value.Kind.(type) {
  177. case *schema_pb.Value_Int32Value:
  178. return parquet.Int32Value(v.Int32Value), nil
  179. case *schema_pb.Value_Int64Value:
  180. return parquet.Int32Value(int32(v.Int64Value)), nil
  181. case *schema_pb.Value_DoubleValue:
  182. return parquet.Int32Value(int32(v.DoubleValue)), nil
  183. case *schema_pb.Value_StringValue:
  184. if i, err := strconv.ParseInt(v.StringValue, 10, 32); err == nil {
  185. return parquet.Int32Value(int32(i)), nil
  186. }
  187. return parquet.Int32Value(0), nil
  188. default:
  189. return parquet.Int32Value(0), nil
  190. }
  191. case schema_pb.ScalarType_INT64:
  192. switch v := value.Kind.(type) {
  193. case *schema_pb.Value_Int64Value:
  194. return parquet.Int64Value(v.Int64Value), nil
  195. case *schema_pb.Value_Int32Value:
  196. return parquet.Int64Value(int64(v.Int32Value)), nil
  197. case *schema_pb.Value_DoubleValue:
  198. return parquet.Int64Value(int64(v.DoubleValue)), nil
  199. case *schema_pb.Value_StringValue:
  200. if i, err := strconv.ParseInt(v.StringValue, 10, 64); err == nil {
  201. return parquet.Int64Value(i), nil
  202. }
  203. return parquet.Int64Value(0), nil
  204. default:
  205. return parquet.Int64Value(0), nil
  206. }
  207. case schema_pb.ScalarType_FLOAT:
  208. switch v := value.Kind.(type) {
  209. case *schema_pb.Value_FloatValue:
  210. return parquet.FloatValue(v.FloatValue), nil
  211. case *schema_pb.Value_DoubleValue:
  212. return parquet.FloatValue(float32(v.DoubleValue)), nil
  213. case *schema_pb.Value_Int64Value:
  214. return parquet.FloatValue(float32(v.Int64Value)), nil
  215. case *schema_pb.Value_StringValue:
  216. if f, err := strconv.ParseFloat(v.StringValue, 32); err == nil {
  217. return parquet.FloatValue(float32(f)), nil
  218. }
  219. return parquet.FloatValue(0), nil
  220. default:
  221. return parquet.FloatValue(0), nil
  222. }
  223. case schema_pb.ScalarType_DOUBLE:
  224. switch v := value.Kind.(type) {
  225. case *schema_pb.Value_DoubleValue:
  226. return parquet.DoubleValue(v.DoubleValue), nil
  227. case *schema_pb.Value_Int64Value:
  228. return parquet.DoubleValue(float64(v.Int64Value)), nil
  229. case *schema_pb.Value_Int32Value:
  230. return parquet.DoubleValue(float64(v.Int32Value)), nil
  231. case *schema_pb.Value_StringValue:
  232. if f, err := strconv.ParseFloat(v.StringValue, 64); err == nil {
  233. return parquet.DoubleValue(f), nil
  234. }
  235. return parquet.DoubleValue(0), nil
  236. default:
  237. return parquet.DoubleValue(0), nil
  238. }
  239. case schema_pb.ScalarType_BYTES:
  240. switch v := value.Kind.(type) {
  241. case *schema_pb.Value_BytesValue:
  242. b := v.BytesValue
  243. if b == nil {
  244. b = []byte{}
  245. }
  246. return parquet.ByteArrayValue(b), nil
  247. case *schema_pb.Value_StringValue:
  248. return parquet.ByteArrayValue([]byte(v.StringValue)), nil
  249. case *schema_pb.Value_Int64Value:
  250. return parquet.ByteArrayValue([]byte(strconv.FormatInt(v.Int64Value, 10))), nil
  251. case *schema_pb.Value_Int32Value:
  252. return parquet.ByteArrayValue([]byte(strconv.FormatInt(int64(v.Int32Value), 10))), nil
  253. case *schema_pb.Value_DoubleValue:
  254. return parquet.ByteArrayValue([]byte(strconv.FormatFloat(v.DoubleValue, 'f', -1, 64))), nil
  255. case *schema_pb.Value_FloatValue:
  256. return parquet.ByteArrayValue([]byte(strconv.FormatFloat(float64(v.FloatValue), 'f', -1, 32))), nil
  257. case *schema_pb.Value_BoolValue:
  258. if v.BoolValue {
  259. return parquet.ByteArrayValue([]byte("true")), nil
  260. }
  261. return parquet.ByteArrayValue([]byte("false")), nil
  262. default:
  263. return parquet.ByteArrayValue([]byte{}), nil
  264. }
  265. case schema_pb.ScalarType_STRING:
  266. // Same as bytes but semantically string
  267. switch v := value.Kind.(type) {
  268. case *schema_pb.Value_StringValue:
  269. return parquet.ByteArrayValue([]byte(v.StringValue)), nil
  270. default:
  271. // Fallback through bytes coercion
  272. b, _ := toParquetValueForType(&schema_pb.Type{Kind: &schema_pb.Type_ScalarType{ScalarType: schema_pb.ScalarType_BYTES}}, value)
  273. return b, nil
  274. }
  275. case schema_pb.ScalarType_TIMESTAMP:
  276. switch v := value.Kind.(type) {
  277. case *schema_pb.Value_Int64Value:
  278. return parquet.Int64Value(v.Int64Value), nil
  279. case *schema_pb.Value_StringValue:
  280. if i, err := strconv.ParseInt(v.StringValue, 10, 64); err == nil {
  281. return parquet.Int64Value(i), nil
  282. }
  283. return parquet.Int64Value(0), nil
  284. default:
  285. return parquet.Int64Value(0), nil
  286. }
  287. case schema_pb.ScalarType_DATE:
  288. switch v := value.Kind.(type) {
  289. case *schema_pb.Value_Int32Value:
  290. return parquet.Int32Value(v.Int32Value), nil
  291. case *schema_pb.Value_Int64Value:
  292. return parquet.Int32Value(int32(v.Int64Value)), nil
  293. case *schema_pb.Value_StringValue:
  294. if i, err := strconv.ParseInt(v.StringValue, 10, 32); err == nil {
  295. return parquet.Int32Value(int32(i)), nil
  296. }
  297. return parquet.Int32Value(0), nil
  298. default:
  299. return parquet.Int32Value(0), nil
  300. }
  301. case schema_pb.ScalarType_DECIMAL:
  302. // Reuse existing conversion path (FixedLenByteArray 16)
  303. return toParquetValue(value)
  304. case schema_pb.ScalarType_TIME:
  305. switch v := value.Kind.(type) {
  306. case *schema_pb.Value_Int64Value:
  307. return parquet.Int64Value(v.Int64Value), nil
  308. case *schema_pb.Value_StringValue:
  309. if i, err := strconv.ParseInt(v.StringValue, 10, 64); err == nil {
  310. return parquet.Int64Value(i), nil
  311. }
  312. return parquet.Int64Value(0), nil
  313. default:
  314. return parquet.Int64Value(0), nil
  315. }
  316. }
  317. }
  318. // Fallback to generic conversion
  319. return toParquetValue(value)
  320. }