This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new fe4bd93 fix(parquet): Reading UUID columns (#173)
fe4bd93 is described below
commit fe4bd9396514bbfcfb1f84bb85af7f664f9558ad
Author: Matt Topol <[email protected]>
AuthorDate: Sat Oct 26 11:27:01 2024 -0400
fix(parquet): Reading UUID columns (#173)
Split from #171 to be a more focused PR.
Currently we correctly write Arrow data with the canonical UUID
extension type as a Parquet UUID column via `pqarrow`. This PR enables
us to read that data back correctly as the `extensions.UUID` data type,
even when we don't have a stored schema.
Added a case to the `TestArrowExtensionTypeRoundTrip` test to ensure a
proper round trip without a stored schema.
---------
Co-authored-by: Sutou Kouhei <[email protected]>
---
arrow/extensions/extensions.go | 4 ++--
parquet/pqarrow/encode_arrow_test.go | 1 +
parquet/pqarrow/schema.go | 21 ++++++++++++++-------
3 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/arrow/extensions/extensions.go b/arrow/extensions/extensions.go
index 4e02823..22fb01f 100644
--- a/arrow/extensions/extensions.go
+++ b/arrow/extensions/extensions.go
@@ -21,8 +21,8 @@ import (
)
var canonicalExtensionTypes = []arrow.ExtensionType{
- &Bool8Type{},
- &UUIDType{},
+ NewBool8Type(),
+ NewUUIDType(),
&OpaqueType{},
&JSONType{},
}
diff --git a/parquet/pqarrow/encode_arrow_test.go
b/parquet/pqarrow/encode_arrow_test.go
index 1ff1710..b75a5c0 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -2057,6 +2057,7 @@ func (ps *ParquetIOTestSuite)
TestArrowExtensionTypeRoundTrip() {
defer tbl.Release()
ps.roundTripTable(mem, tbl, true)
+ ps.roundTripTable(mem, tbl, false)
}
func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() {
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index 77b8f75..6d30359 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -514,8 +514,14 @@ func arrowFromFLBA(logical schema.LogicalType, length int)
(arrow.DataType, erro
switch logtype := logical.(type) {
case schema.DecimalLogicalType:
return arrowDecimal(logtype), nil
- case schema.NoLogicalType, schema.IntervalLogicalType,
schema.UUIDLogicalType:
+ case schema.NoLogicalType, schema.IntervalLogicalType:
return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
+ case schema.UUIDLogicalType:
+ uuidType := arrow.GetExtensionType("arrow.uuid")
+ if uuidType == nil {
+ return &arrow.FixedSizeBinaryType{ByteWidth:
int(length)}, nil
+ }
+ return uuidType, nil
case schema.Float16LogicalType:
return &arrow.Float16Type{}, nil
default:
@@ -984,13 +990,14 @@ func applyOriginalStorageMetadata(origin arrow.Field,
inferred *SchemaField) (mo
return
}
- if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type)
{
- return modified, fmt.Errorf("%w: mismatch storage type
'%s' for extension type '%s'",
- arrow.ErrInvalid, inferred.Field.Type, extType)
- }
+ if modified && !arrow.TypeEqual(extType, inferred.Field.Type) {
+ if !arrow.TypeEqual(extType.StorageType(),
inferred.Field.Type) {
+ return modified, fmt.Errorf("%w: mismatch
storage type '%s' for extension type '%s'",
+ arrow.ErrInvalid, inferred.Field.Type,
extType)
+ }
- inferred.Field.Type = extType
- modified = true
+ inferred.Field.Type = extType
+ }
case arrow.SPARSE_UNION, arrow.DENSE_UNION:
err = xerrors.New("unimplemented type")
case arrow.STRUCT: