This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c68f76838c GH-34584: [Go][CSV] Add extension types support (#34585)
c68f76838c is described below

commit c68f76838ccaeaf97d8001237fd0ac89a58c293b
Author: Yevgeny Pats <[email protected]>
AuthorDate: Thu Mar 23 10:45:45 2023 -0400

    GH-34584: [Go][CSV] Add extension types support (#34585)
    
    Built on top of https://github.com/apache/arrow/issues/34453
    
    ### Rationale for this change
    
    ### What changes are included in this PR?
    
    ### Are these changes tested?
    
    ### Are there any user-facing changes?
    
    * Closes: #34584
    
    Authored-by: Yevgeny Pats <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 go/arrow/array/extension.go                        |  9 ++-
 go/arrow/csv/common.go                             |  1 +
 go/arrow/csv/reader.go                             | 17 ++++++
 go/arrow/csv/reader_test.go                        |  5 ++
 go/arrow/csv/testdata/header.csv                   |  8 +--
 go/arrow/csv/testdata/types.csv                    |  8 +--
 go/arrow/csv/transformer.go                        |  9 +++
 go/arrow/csv/writer_test.go                        | 25 ++++----
 go/arrow/internal/testing/types/extension_types.go | 67 +++++++++++++++++++++-
 9 files changed, 128 insertions(+), 21 deletions(-)

diff --git a/go/arrow/array/extension.go b/go/arrow/array/extension.go
index 10ad54c948..cd856df675 100644
--- a/go/arrow/array/extension.go
+++ b/go/arrow/array/extension.go
@@ -38,7 +38,8 @@ type ExtensionArray interface {
        ExtensionType() arrow.ExtensionType
        // Storage returns the underlying storage array for this array.
        Storage() arrow.Array
-
+       // ValueString returns a string representation of the value at the given 
index for the extension array.
+       ValueString(i int) string
        // by having a non-exported function in the interface, it means that
        // consumers must embed ExtensionArrayBase in their structs in order
        // to fulfill this interface.
@@ -184,6 +185,12 @@ func (e *ExtensionArrayBase) setData(data *Data) {
        e.storage = MakeFromData(storageData).(arraymarshal)
 }
 
+// ValueString returns the value at index i as a string.
+// This needs to be implemented by the extension array type.
+func (e *ExtensionArrayBase) ValueString(i int) string {
+       panic("arrow/array: ValueString wasn't implemented by this extension 
array type")
+}
+
 // no-op function that exists simply to force embedding this in any extension 
array types.
 func (ExtensionArrayBase) mustEmbedExtensionArrayBase() {}
 
diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go
index 74dbf36f60..3dbc82d64b 100644
--- a/go/arrow/csv/common.go
+++ b/go/arrow/csv/common.go
@@ -224,6 +224,7 @@ func validate(schema *arrow.Schema) {
                case *arrow.Decimal128Type, *arrow.Decimal256Type:
                case *arrow.ListType:
                case *arrow.BinaryType:
+               case arrow.ExtensionType:
                default:
                        panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid 
data type %T", i, f.Name, ft))
                }
diff --git a/go/arrow/csv/reader.go b/go/arrow/csv/reader.go
index abb3db3627..fd24365c0a 100644
--- a/go/arrow/csv/reader.go
+++ b/go/arrow/csv/reader.go
@@ -35,6 +35,7 @@ import (
        "github.com/apache/arrow/go/v12/arrow/decimal256"
        "github.com/apache/arrow/go/v12/arrow/internal/debug"
        "github.com/apache/arrow/go/v12/arrow/memory"
+       "github.com/goccy/go-json"
 )
 
 // Reader wraps encoding/csv.Reader and creates array.Records from a schema.
@@ -474,6 +475,10 @@ func (r *Reader) initFieldConverter(bldr array.Builder) 
func(string) {
                return func(s string) {
                        r.parseBinaryType(bldr, s)
                }
+       case arrow.ExtensionType:
+               return func(s string) {
+                       r.parseExtension(bldr, s)
+               }
        default:
                panic(fmt.Errorf("arrow/csv: unhandled field type %T", 
bldr.Type()))
        }
@@ -773,6 +778,18 @@ func (r *Reader) parseBinaryType(field array.Builder, str 
string) {
        field.(*array.BinaryBuilder).Append(decodedVal)
 }
 
+func (r *Reader) parseExtension(field array.Builder, str string) {
+       if r.isNull(str) {
+               field.AppendNull()
+               return
+       }
+       dec := json.NewDecoder(strings.NewReader(`"` + str + `"`))
+       if err := field.UnmarshalOne(dec); err != nil {
+               r.err = err
+               return
+       }
+}
+
 // Retain increases the reference count by 1.
 // Retain may be called simultaneously from multiple goroutines.
 func (r *Reader) Retain() {
diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go
index 0257ca6a49..c5bd90d76e 100644
--- a/go/arrow/csv/reader_test.go
+++ b/go/arrow/csv/reader_test.go
@@ -31,6 +31,7 @@ import (
        "github.com/apache/arrow/go/v12/arrow/csv"
        "github.com/apache/arrow/go/v12/arrow/decimal128"
        "github.com/apache/arrow/go/v12/arrow/decimal256"
+       "github.com/apache/arrow/go/v12/arrow/internal/testing/types"
        "github.com/apache/arrow/go/v12/arrow/memory"
        "github.com/stretchr/testify/assert"
        "github.com/stretchr/testify/require"
@@ -335,6 +336,7 @@ func testCSVReader(t *testing.T, filepath string, 
withHeader bool) {
                        {Name: "ts", Type: arrow.FixedWidthTypes.Timestamp_ms},
                        {Name: "list(i64)", Type: 
arrow.ListOf(arrow.PrimitiveTypes.Int64)},
                        {Name: "binary", Type: arrow.BinaryTypes.Binary},
+                       {Name: "uuid", Type: types.NewUUIDType()},
                },
                nil,
        )
@@ -384,6 +386,7 @@ rec[0]["str"]: ["str-1"]
 rec[0]["ts"]: [1652054461000]
 rec[0]["list(i64)"]: [[1 2 3]]
 rec[0]["binary"]: ["\x00\x01\x02"]
+rec[0]["uuid"]: ["00000000-0000-0000-0000-000000000001"]
 rec[1]["bool"]: [false]
 rec[1]["i8"]: [-2]
 rec[1]["i16"]: [-2]
@@ -399,6 +402,7 @@ rec[1]["str"]: ["str-2"]
 rec[1]["ts"]: [1652140799000]
 rec[1]["list(i64)"]: [[]]
 rec[1]["binary"]: [""]
+rec[1]["uuid"]: ["00000000-0000-0000-0000-000000000002"]
 rec[2]["bool"]: [(null)]
 rec[2]["i8"]: [(null)]
 rec[2]["i16"]: [(null)]
@@ -414,6 +418,7 @@ rec[2]["str"]: [(null)]
 rec[2]["ts"]: [(null)]
 rec[2]["list(i64)"]: [(null)]
 rec[2]["binary"]: [(null)]
+rec[2]["uuid"]: [(null)]
 `
        got, want := out.String(), want
        require.Equal(t, want, got)
diff --git a/go/arrow/csv/testdata/header.csv b/go/arrow/csv/testdata/header.csv
index 3ee4aa740e..0987673cd5 100644
--- a/go/arrow/csv/testdata/header.csv
+++ b/go/arrow/csv/testdata/header.csv
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts;list(i64);binary
-true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC
-false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};
-null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
+bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts;list(i64);binary;uuid
+true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001
+false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
+null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
diff --git a/go/arrow/csv/testdata/types.csv b/go/arrow/csv/testdata/types.csv
index afdefa40b1..a90e35e253 100644
--- a/go/arrow/csv/testdata/types.csv
+++ b/go/arrow/csv/testdata/types.csv
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 #
-## supported types: 
bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float32;float64;string;timestamp;list(i64);binary
-true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC
-false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};
-null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
+## supported types: 
bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float32;float64;string;timestamp;list(i64);binary;uuid
+true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001
+false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
+null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go
index 9947439eec..46b0c4fdee 100644
--- a/go/arrow/csv/transformer.go
+++ b/go/arrow/csv/transformer.go
@@ -225,6 +225,15 @@ func (w *Writer) transformColToStringArr(typ 
arrow.DataType, col arrow.Array) []
                                res[i] = w.nullValue
                        }
                }
+       case arrow.ExtensionType:
+               arr := col.(array.ExtensionArray)
+               for i := 0; i < arr.Len(); i++ {
+                       if arr.IsNull(i) {
+                               res[i] = w.nullValue
+                       } else {
+                               res[i] = arr.ValueString(i)
+                       }
+               }
        default:
                panic(fmt.Errorf("arrow/csv: field has unsupported data type 
%s", typ.String()))
        }
diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go
index b16a568231..bfe0bcddf9 100644
--- a/go/arrow/csv/writer_test.go
+++ b/go/arrow/csv/writer_test.go
@@ -30,7 +30,9 @@ import (
        "github.com/apache/arrow/go/v12/arrow/csv"
        "github.com/apache/arrow/go/v12/arrow/decimal128"
        "github.com/apache/arrow/go/v12/arrow/decimal256"
+       "github.com/apache/arrow/go/v12/arrow/internal/testing/types"
        "github.com/apache/arrow/go/v12/arrow/memory"
+       "github.com/google/uuid"
 )
 
 const (
@@ -131,18 +133,18 @@ func Example_writer() {
 
 var (
        fullData = [][]string{
-               {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", 
"f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", 
"binary"},
-               {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", 
"str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", 
"-123.45", "{1,2,3}", "AAEC"},
-               {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", 
"str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", 
"{4,5,6}", "AwQF"},
-               {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", 
"str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", 
"{7,8,9}", ""},
-               {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal},
+               {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", 
"f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", 
"binary", "uuid"},
+               {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", 
"str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", 
"-123.45", "{1,2,3}", "AAEC", "00000000-0000-0000-0000-000000000001"},
+               {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", 
"str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", 
"{4,5,6}", "AwQF", "00000000-0000-0000-0000-000000000002"},
+               {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", 
"str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", 
"{7,8,9}", "", "00000000-0000-0000-0000-000000000003"},
+               {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal, nullVal},
        }
        bananaData = [][]string{
-               {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", 
"f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", 
"binary"},
-               {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", 
"0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", 
"-123.45", "{1,2,3}", "AAEC"},
-               {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", 
"str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", 
"{4,5,6}", "AwQF"},
-               {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", 
"0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", 
"123.45", "{7,8,9}", ""},
-               {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal},
+               {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", 
"f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", 
"binary", "uuid"},
+               {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", 
"0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", 
"-123.45", "{1,2,3}", "AAEC", "00000000-0000-0000-0000-000000000001"},
+               {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", 
"str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", 
"{4,5,6}", "AwQF", "00000000-0000-0000-0000-000000000002"},
+               {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", 
"0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", 
"123.45", "{7,8,9}", "", "00000000-0000-0000-0000-000000000003"},
+               {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, 
nullVal, nullVal, nullVal, nullVal, nullVal},
        }
 )
 
@@ -220,6 +222,7 @@ func testCSVWriter(t *testing.T, data [][]string, 
writeHeader bool, fmtr func(bo
                        {Name: "dec256", Type: &arrow.Decimal256Type{Precision: 
5, Scale: 2}},
                        {Name: "list(i64)", Type: 
arrow.ListOf(arrow.PrimitiveTypes.Int64)},
                        {Name: "binary", Type: arrow.BinaryTypes.Binary},
+                       {Name: "uuid", Type: types.NewUUIDType()},
                },
                nil,
        )
@@ -253,6 +256,8 @@ func testCSVWriter(t *testing.T, data [][]string, 
writeHeader bool, fmtr func(bo
        listBuilder.Append(true)
        listBuilderInt64.AppendValues([]int64{7, 8, 9}, nil)
        b.Field(18).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 
4, 5}, {}}, nil)
+       
b.Field(19).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"),
 uuid.MustParse("00000000-0000-0000-0000-000000000002"), 
uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil)
+       
 
        for _, field := range b.Fields() {
                field.AppendNull()
diff --git a/go/arrow/internal/testing/types/extension_types.go 
b/go/arrow/internal/testing/types/extension_types.go
index c2e9133fe8..80d8111b4e 100644
--- a/go/arrow/internal/testing/types/extension_types.go
+++ b/go/arrow/internal/testing/types/extension_types.go
@@ -49,8 +49,9 @@ func (b *UUIDBuilder) Append(v uuid.UUID) {
 
 func (b *UUIDBuilder) AppendValues(v []uuid.UUID, valid []bool) {
        data := make([][]byte, len(v))
-       for i, v := range v {
-               data[i] = v[:]
+       for i, u := range v {
+               data[i] = make([]byte, 16)
+               copy(data[i][:], u[:])
        }
        
b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, 
valid)
 }
@@ -121,6 +122,18 @@ type UUIDArray struct {
        array.ExtensionArrayBase
 }
 
+func (a UUIDArray) ValueString(i int) string {
+       if a.IsNull(i) {
+               return "(null)"
+       }
+       arr := a.Storage().(*array.FixedSizeBinary)
+       uuidStr, err := uuid.FromBytes(arr.Value(i))
+       if err != nil {
+               panic(fmt.Errorf("invalid uuid: %w", err))
+       }
+       return uuidStr.String()
+}
+
 func (a UUIDArray) String() string {
        arr := a.Storage().(*array.FixedSizeBinary)
        o := new(strings.Builder)
@@ -222,6 +235,14 @@ type Parametric1Array struct {
        array.ExtensionArrayBase
 }
 
+func (a Parametric1Array) ValueString(i int) string {
+       arr := a.Storage().(*array.Int32)
+       if a.IsNull(i) {
+               return "(null)"
+       }
+       return fmt.Sprintf("%d", arr.Value(i))
+}
+
 // Parametric2Array is another simple int32 array for use with the 
Parametric2Type
 // also for testing a parameterized user-defined extension type that utilizes
 // the parameter for defining different types based on the param.
@@ -229,6 +250,15 @@ type Parametric2Array struct {
        array.ExtensionArrayBase
 }
 
+func (a Parametric2Array) ValueString(i int) string {
+       arr := a.Storage().(*array.Int32)
+       if a.IsNull(i) {
+               return "(null)"
+       }
+       return fmt.Sprintf("%d", arr.Value(i))
+}
+
+
 // A type where ExtensionName is always the same
 type Parametric1Type struct {
        arrow.ExtensionBase
@@ -236,6 +266,7 @@ type Parametric1Type struct {
        param int32
 }
 
+
 func NewParametric1Type(p int32) *Parametric1Type {
        ret := &Parametric1Type{param: p}
        ret.ExtensionBase.Storage = arrow.PrimitiveTypes.Int32
@@ -339,6 +370,18 @@ type ExtStructArray struct {
        array.ExtensionArrayBase
 }
 
+func (a ExtStructArray) ValueString(i int) string {
+       arr := a.Storage().(*array.Struct)
+       if a.IsNull(i) {
+               return "(null)"
+       }
+       b, err := arr.MarshalJSON()
+       if err != nil {
+               panic(err)
+       }
+       return string(b)
+}
+
 // ExtStructType is an extension type with a non-primitive storage type 
containing a struct
 // with fields {a: int64, b: float64}
 type ExtStructType struct {
@@ -384,6 +427,18 @@ type DictExtensionArray struct {
        array.ExtensionArrayBase
 }
 
+func (a DictExtensionArray) ValueString(i int) string {
+       arr := a.Storage().(*array.Dictionary)
+       if a.IsNull(i) {
+               return "(null)"
+       }
+       b, err := arr.MarshalJSON()
+       if err != nil {
+               panic(err)
+       }
+       return string(b)
+}
+
 type DictExtensionType struct {
        arrow.ExtensionBase
 }
@@ -423,6 +478,14 @@ type SmallintArray struct {
        array.ExtensionArrayBase
 }
 
+func (a SmallintArray) ValueString(i int) string {
+       if a.IsNull(i) {
+               return "(null)"
+       }
+       arr := a.Storage().(*array.Int16)
+       return fmt.Sprintf("%d", arr.Value(i))
+}
+
 type SmallintType struct {
        arrow.ExtensionBase
 }

Reply via email to