This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new fe4bd93  fix(parquet): Reading UUID columns (#173)
fe4bd93 is described below

commit fe4bd9396514bbfcfb1f84bb85af7f664f9558ad
Author: Matt Topol <[email protected]>
AuthorDate: Sat Oct 26 11:27:01 2024 -0400

    fix(parquet): Reading UUID columns (#173)
    
    Split from #171 to be a more focused PR.
    
    Currently we properly write Arrow data with the canonical UUID
    extension type as a Parquet UUID column via `pqarrow`. This PR enables
    us to read that data back correctly as the `extensions.UUID` data type,
    even when we don't have a stored schema.
    
    Extended the `TestArrowExtensionTypeRoundTrip` test to also verify a
    proper round trip without a stored schema.
    
    ---------
    
    Co-authored-by: Sutou Kouhei <[email protected]>
---
 arrow/extensions/extensions.go       |  4 ++--
 parquet/pqarrow/encode_arrow_test.go |  1 +
 parquet/pqarrow/schema.go            | 21 ++++++++++++++-------
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/arrow/extensions/extensions.go b/arrow/extensions/extensions.go
index 4e02823..22fb01f 100644
--- a/arrow/extensions/extensions.go
+++ b/arrow/extensions/extensions.go
@@ -21,8 +21,8 @@ import (
 )
 
 var canonicalExtensionTypes = []arrow.ExtensionType{
-       &Bool8Type{},
-       &UUIDType{},
+       NewBool8Type(),
+       NewUUIDType(),
        &OpaqueType{},
        &JSONType{},
 }
diff --git a/parquet/pqarrow/encode_arrow_test.go b/parquet/pqarrow/encode_arrow_test.go
index 1ff1710..b75a5c0 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -2057,6 +2057,7 @@ func (ps *ParquetIOTestSuite) TestArrowExtensionTypeRoundTrip() {
        defer tbl.Release()
 
        ps.roundTripTable(mem, tbl, true)
+       ps.roundTripTable(mem, tbl, false)
 }
 
 func (ps *ParquetIOTestSuite) TestArrowUnknownExtensionTypeRoundTrip() {
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index 77b8f75..6d30359 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -514,8 +514,14 @@ func arrowFromFLBA(logical schema.LogicalType, length int) (arrow.DataType, erro
        switch logtype := logical.(type) {
        case schema.DecimalLogicalType:
                return arrowDecimal(logtype), nil
-       case schema.NoLogicalType, schema.IntervalLogicalType, schema.UUIDLogicalType:
+       case schema.NoLogicalType, schema.IntervalLogicalType:
                return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
+       case schema.UUIDLogicalType:
+               uuidType := arrow.GetExtensionType("arrow.uuid")
+               if uuidType == nil {
+                       return &arrow.FixedSizeBinaryType{ByteWidth: int(length)}, nil
+               }
+               return uuidType, nil
        case schema.Float16LogicalType:
                return &arrow.Float16Type{}, nil
        default:
@@ -984,13 +990,14 @@ func applyOriginalStorageMetadata(origin arrow.Field, inferred *SchemaField) (mo
                        return
                }
 
-               if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
-                       return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
-                               arrow.ErrInvalid, inferred.Field.Type, extType)
-               }
+               if modified && !arrow.TypeEqual(extType, inferred.Field.Type) {
+                       if !arrow.TypeEqual(extType.StorageType(), inferred.Field.Type) {
+                               return modified, fmt.Errorf("%w: mismatch storage type '%s' for extension type '%s'",
+                                       arrow.ErrInvalid, inferred.Field.Type, extType)
+                       }
 
-               inferred.Field.Type = extType
-               modified = true
+                       inferred.Field.Type = extType
+               }
        case arrow.SPARSE_UNION, arrow.DENSE_UNION:
                err = xerrors.New("unimplemented type")
        case arrow.STRUCT:

Reply via email to