zeroshade commented on code in PR #524:
URL: https://github.com/apache/iceberg-go/pull/524#discussion_r2370363703
##########
schema_conversions.go:
##########
@@ -30,33 +30,36 @@ func partitionTypeToAvroSchema(t *StructType) (avro.Schema,
error) {
var sc avro.Schema
switch typ := f.Type.(type) {
case Int32Type:
- sc = internal.IntSchema
+ sc = internal.NullableSchema(internal.IntSchema)
case Int64Type:
- sc = internal.LongSchema
+ sc = internal.NullableSchema(internal.LongSchema)
case Float32Type:
- sc = internal.FloatSchema
+ sc = internal.NullableSchema(internal.FloatSchema)
case Float64Type:
- sc = internal.DoubleSchema
+ sc = internal.NullableSchema(internal.DoubleSchema)
case StringType:
- sc = internal.StringSchema
+ sc = internal.NullableSchema(internal.StringSchema)
case DateType:
- sc = internal.DateSchema
+ sc = internal.NullableSchema(internal.DateSchema)
case TimeType:
- sc = internal.TimeSchema
+ sc = internal.NullableSchema(internal.TimeSchema)
case TimestampType:
- sc = internal.TimestampSchema
+ sc = internal.NullableSchema(internal.TimestampSchema)
case TimestampTzType:
- sc = internal.TimestampTzSchema
+ sc = internal.NullableSchema(internal.TimestampTzSchema)
case UUIDType:
- sc = internal.UUIDSchema
+ sc = internal.NullableSchema(internal.UUIDSchema)
case BooleanType:
- sc = internal.BoolSchema
+ sc = internal.NullableSchema(internal.BoolSchema)
case BinaryType:
- sc = internal.BinarySchema
+ sc = internal.NullableSchema(internal.BinarySchema)
case FixedType:
- sc = internal.Must(avro.NewFixedSchema("fixed", "",
typ.len, nil))
+ // Currently the hamba/avro library couldn't resolve
the [n]byte array types for fixed schemas in unions.
+ // TODO: Create the proper Fixed Schema for Avro that
can match the use case
Review Comment:
Can we file an issue on hamba/avro to get this fixed?
##########
manifest.go:
##########
@@ -1583,18 +1722,87 @@ func (d *dataFile) initializeMapData() {
d.fieldIDToPartitionData = make(map[int]any,
len(d.PartitionData))
for k, v := range d.PartitionData {
if id, ok := d.fieldNameToID[k]; ok {
- d.fieldIDToPartitionData[id] = v
+ convertedValue :=
d.convertAvroValueToIcebergType(v, id)
+ d.fieldIDToPartitionData[id] =
convertedValue
}
}
}
- d.fieldIDToPartitionData =
avroPartitionData(d.fieldIDToPartitionData, d.fieldIDToLogicalType)
})
}
+func (d *dataFile) convertAvroValueToIcebergType(v any, fieldID int) any {
+ if logicalType, ok := d.fieldIDToLogicalType[fieldID]; ok {
+ switch logicalType {
+ case avro.Date:
+ if val, ok := v.(time.Time); ok {
+ return Date(val.Truncate(24*time.Hour).Unix() /
int64((time.Hour * 24).Seconds()))
+ }
+
+ return Date(v.(int32))
+ case avro.TimeMillis:
+ if val, ok := v.(time.Duration); ok {
+ return Time(val.Milliseconds())
+ }
+
+ return Time(v.(int64))
+ case avro.TimeMicros:
+ if val, ok := v.(time.Duration); ok {
+ return Time(val.Microseconds())
+ }
+
+ return Time(v.(int64))
+ case avro.TimestampMillis:
+ if val, ok := v.(time.Time); ok {
+ return Timestamp(val.UTC().UnixMilli())
+ }
+
+ return Timestamp(v.(int64))
+ case avro.TimestampMicros:
+ if val, ok := v.(time.Time); ok {
+ return Timestamp(val.UTC().UnixMicro())
+ }
+
+ return Timestamp(v.(int64))
+ case avro.Decimal:
+ if unionMap, ok := v.(map[string]interface{}); ok {
+ if val, ok := unionMap["fixed"]; ok {
+ if bigRatValue, ok := val.(*big.Rat);
ok {
+ scale :=
d.fieldIDToFixedSize[fieldID]
+ scaleFactor :=
new(big.Int).Exp(big.NewInt(10), big.NewInt(int64(scale)), nil)
+ unscaled :=
new(big.Int).Mul(bigRatValue.Num(), scaleFactor)
+ unscaled =
unscaled.Div(unscaled, bigRatValue.Denom())
+ decimal128Val :=
decimal128.FromBigInt(unscaled)
+
+ return DecimalLiteral{
+ Scale: scale,
+ Val: decimal128Val,
+ }
+ }
+ }
+ }
Review Comment:
I think it would be better and more performant if we instead registered
`[]byte` as the type for reading in a decimal and then just used
`UnmarshalBinary` etc. for the decimal. It doesn't make sense for hamba/avro to
do all the computation to generate the `big.Rat` only for us to undo that.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]