zeroshade commented on code in PR #524: URL: https://github.com/apache/iceberg-go/pull/524#discussion_r2314393801
########## manifest.go: ########## @@ -1461,33 +1598,114 @@ func mapToAvroColMap[K comparable, V any](m map[K]V) *[]colMap[K, V] { return &out } -func avroPartitionData(input map[int]any, logicalTypes map[int]avro.LogicalType) map[int]any { +func avroPartitionData(input map[int]any, logicalTypes map[int]avro.LogicalType, fixedSizes map[int]int) map[int]any { out := make(map[int]any) for k, v := range input { if logical, ok := logicalTypes[k]; ok { - switch logical { - case avro.Date: - out[k] = Date(v.(time.Time).Truncate(24*time.Hour).Unix() / int64((time.Hour * 24).Seconds())) - case avro.TimeMillis: - out[k] = Time(v.(time.Duration).Milliseconds()) - case avro.TimeMicros: - out[k] = Time(v.(time.Duration).Microseconds()) - case avro.TimestampMillis: - out[k] = Timestamp(v.(time.Time).UTC().UnixMilli()) - case avro.TimestampMicros: - out[k] = Timestamp(v.(time.Time).UTC().UnixMicro()) - default: - out[k] = v - } - - continue + out[k] = convertLogicalTypeValue(v, logical, fixedSizes[k]) + } else { + out[k] = convertDefaultValue(v, fixedSizes[k]) } - out[k] = v } return out } +func convertLogicalTypeValue(v any, logicalType avro.LogicalType, fixedSize int) any { + switch logicalType { + case avro.Date: + return convertDateValue(v) + case avro.TimeMicros: + return convertTimeMicrosValue(v) + case avro.TimestampMicros: + return convertTimestampMicrosValue(v) + case avro.Decimal: + return convertDecimalValue(v, fixedSize) + default: + return v + } +} + +func convertDateValue(v any) any { + if t, ok := v.(time.Time); ok { + return map[string]any{"int.date": int32(t.Truncate(24*time.Hour).Unix() / int64((time.Hour * 24).Seconds()))} + } + if d, ok := v.(Date); ok { + return map[string]any{"int.date": int32(d)} + } + + return v +} + +func convertTimeMicrosValue(v any) any { + if t, ok := v.(Time); ok { + return map[string]any{"long.time-micros": int64(t)} + } + if d, ok := v.(time.Duration); ok { + return map[string]any{"long.time-micros": d.Microseconds()} + } + + 
return v +} + +func convertTimestampMicrosValue(v any) any { + if t, ok := v.(time.Time); ok { + return map[string]any{"long.timestamp-micros": t.UTC().UnixMicro()} + } + if ts, ok := v.(Timestamp); ok { + return map[string]any{"long.timestamp-micros": int64(ts)} + } + + return v +} + +func convertDecimalValue(v any, fixedSize int) any { + if v == nil { + return map[string]any{"null": nil} + } + + dec, ok := v.(Decimal) + if !ok { + return v + } + + bytes, err := DecimalLiteral(dec).MarshalBinary() + if err != nil { + return v + } + fixedArray := convertToFixedArray(padOrTruncateBytes(bytes, fixedSize), fixedSize) Review Comment: why converting to fixed array instead of just returning the decimal type? ########## manifest.go: ########## @@ -1461,33 +1598,114 @@ func mapToAvroColMap[K comparable, V any](m map[K]V) *[]colMap[K, V] { return &out } -func avroPartitionData(input map[int]any, logicalTypes map[int]avro.LogicalType) map[int]any { +func avroPartitionData(input map[int]any, logicalTypes map[int]avro.LogicalType, fixedSizes map[int]int) map[int]any { out := make(map[int]any) for k, v := range input { if logical, ok := logicalTypes[k]; ok { - switch logical { - case avro.Date: - out[k] = Date(v.(time.Time).Truncate(24*time.Hour).Unix() / int64((time.Hour * 24).Seconds())) - case avro.TimeMillis: - out[k] = Time(v.(time.Duration).Milliseconds()) - case avro.TimeMicros: - out[k] = Time(v.(time.Duration).Microseconds()) - case avro.TimestampMillis: - out[k] = Timestamp(v.(time.Time).UTC().UnixMilli()) - case avro.TimestampMicros: - out[k] = Timestamp(v.(time.Time).UTC().UnixMicro()) - default: - out[k] = v - } - - continue + out[k] = convertLogicalTypeValue(v, logical, fixedSizes[k]) + } else { + out[k] = convertDefaultValue(v, fixedSizes[k]) } - out[k] = v } return out } +func convertLogicalTypeValue(v any, logicalType avro.LogicalType, fixedSize int) any { + switch logicalType { + case avro.Date: + return convertDateValue(v) + case avro.TimeMicros: + return 
convertTimeMicrosValue(v) + case avro.TimestampMicros: + return convertTimestampMicrosValue(v) + case avro.Decimal: + return convertDecimalValue(v, fixedSize) + default: + return v + } +} + +func convertDateValue(v any) any { + if t, ok := v.(time.Time); ok { + return map[string]any{"int.date": int32(t.Truncate(24*time.Hour).Unix() / int64((time.Hour * 24).Seconds()))} + } + if d, ok := v.(Date); ok { + return map[string]any{"int.date": int32(d)} + } + + return v +} + +func convertTimeMicrosValue(v any) any { + if t, ok := v.(Time); ok { + return map[string]any{"long.time-micros": int64(t)} + } + if d, ok := v.(time.Duration); ok { + return map[string]any{"long.time-micros": d.Microseconds()} + } + + return v +} + +func convertTimestampMicrosValue(v any) any { + if t, ok := v.(time.Time); ok { + return map[string]any{"long.timestamp-micros": t.UTC().UnixMicro()} + } + if ts, ok := v.(Timestamp); ok { + return map[string]any{"long.timestamp-micros": int64(ts)} + } + + return v +} + +func convertDecimalValue(v any, fixedSize int) any { + if v == nil { + return map[string]any{"null": nil} + } + + dec, ok := v.(Decimal) + if !ok { + return v + } + + bytes, err := DecimalLiteral(dec).MarshalBinary() + if err != nil { + return v + } + fixedArray := convertToFixedArray(padOrTruncateBytes(bytes, fixedSize), fixedSize) + + return map[string]any{"fixed": fixedArray} +} + +func convertDefaultValue(v any, fixedSize int) any { + if uuidVal, ok := v.(uuid.UUID); ok { + return uuidVal.String() + } + + if bytes, ok := v.([]byte); ok && fixedSize > 0 { + return convertToFixedArray(padOrTruncateBytes(bytes, fixedSize), fixedSize) Review Comment: why do we need to do this? We can't just return the uuid? 
########## exprs.go: ########## @@ -482,8 +482,33 @@ func (b *boundRef[T]) Equals(other BoundTerm) bool { } func (b *boundRef[T]) Ref() BoundReference { return b } -func (b *boundRef[T]) Field() NestedField { return b.field } -func (b *boundRef[T]) Type() Type { return b.field.Type } + +func unwrapLogicalTypeValue(v any) any { + if m, ok := v.(map[string]any); ok { + if val, exists := m["long.timestamp-micros"]; exists { + if microseconds, ok := val.(int64); ok { + return Timestamp(microseconds) + } + } + + if val, exists := m["int.date"]; exists { + if days, ok := val.(int32); ok { + return days + } + } + + if val, exists := m["long.time-micros"]; exists { + if microseconds, ok := val.(int64); ok { + return Time(microseconds) + } + } + } + + return v Review Comment: this conversion should be happening before we ever get this far. What workflow would result in us getting here using `map[string]any`? ########## manifest.go: ########## @@ -416,30 +429,53 @@ func getFieldIDMap(sc avro.Schema) (map[string]int, map[int]avro.LogicalType) { result := make(map[string]int) logicalTypes := make(map[int]avro.LogicalType) + fixedSizes := make(map[int]int) + entryField := getField(sc.(*avro.RecordSchema), "data_file") partitionField := getField(entryField.Type().(*avro.RecordSchema), "partition") for _, field := range partitionField.Type().(*avro.RecordSchema).Fields() { - if fid, ok := field.Prop("field-id").(float64); ok { - result[field.Name()] = int(fid) - avroTyp := field.Type() - if us, ok := avroTyp.(*avro.UnionSchema); ok { - for _, t := range us.Types() { - avroTyp = t - } + var fid int + switch v := field.Prop("field-id").(type) { Review Comment: does the `field-id` come back as a float instead of an `int` for some reason? 
########## table/arrow_utils.go: ########## @@ -1030,11 +1031,19 @@ func (sc *schemaCompatVisitor) isFieldCompat(lhs iceberg.NestedField) bool { func (sc *schemaCompatVisitor) Schema(s *iceberg.Schema, v func() bool) bool { if !v() { - pterm.DisableColor() - tbl := pterm.DefaultTable.WithHasHeader(true).WithData(sc.errorData) - tbl.Render() - txt, _ := tbl.Srender() - pterm.EnableColor() + var lines []string + lines = append(lines, " | Table Field | Requested Field") + + for i, row := range sc.errorData { + if i == 0 { + continue + } + if len(row) >= 3 { + lines = append(lines, fmt.Sprintf("%s | %-24s | %s", row[0], row[1], row[2])) + } + } Review Comment: why the change here? ########## manifest.go: ########## @@ -416,30 +429,53 @@ func getFieldIDMap(sc avro.Schema) (map[string]int, map[int]avro.LogicalType) { result := make(map[string]int) logicalTypes := make(map[int]avro.LogicalType) + fixedSizes := make(map[int]int) + entryField := getField(sc.(*avro.RecordSchema), "data_file") partitionField := getField(entryField.Type().(*avro.RecordSchema), "partition") for _, field := range partitionField.Type().(*avro.RecordSchema).Fields() { - if fid, ok := field.Prop("field-id").(float64); ok { - result[field.Name()] = int(fid) - avroTyp := field.Type() - if us, ok := avroTyp.(*avro.UnionSchema); ok { - for _, t := range us.Types() { - avroTyp = t - } + var fid int + switch v := field.Prop("field-id").(type) { + case int: + fid = v + case int32: + fid = int(v) + case int64: + fid = int(v) + case float64: + fid = int(v) + default: + continue + } + + result[field.Name()] = fid + avroTyp := field.Type() + if us, ok := avroTyp.(*avro.UnionSchema); ok { + for _, t := range us.Types() { + avroTyp = t } Review Comment: should we confirm that we're not using null? 
########## manifest.go: ########## @@ -960,6 +1003,37 @@ func (p *partitionFieldStats[T]) update(value any) (err error) { return nil } +func extractBytesFromFixed(fixedBytes interface{}) []byte { + switch fb := fixedBytes.(type) { + case []interface{}: Review Comment: In what situation does this happen? Where do we get `[]interface{}` instead of `[]byte`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org