This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c68f76838c GH-34584: [Go][CSV] Add extension types support (#34585)
c68f76838c is described below
commit c68f76838ccaeaf97d8001237fd0ac89a58c293b
Author: Yevgeny Pats <[email protected]>
AuthorDate: Thu Mar 23 10:45:45 2023 -0400
GH-34584: [Go][CSV] Add extension types support (#34585)
Built on top of https://github.com/apache/arrow/issues/34453
### Rationale for this change
### What changes are included in this PR?
### Are these changes tested?
### Are there any user-facing changes?
* Closes: #34584
Authored-by: Yevgeny Pats <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
go/arrow/array/extension.go | 9 ++-
go/arrow/csv/common.go | 1 +
go/arrow/csv/reader.go | 17 ++++++
go/arrow/csv/reader_test.go | 5 ++
go/arrow/csv/testdata/header.csv | 8 +--
go/arrow/csv/testdata/types.csv | 8 +--
go/arrow/csv/transformer.go | 9 +++
go/arrow/csv/writer_test.go | 25 ++++----
go/arrow/internal/testing/types/extension_types.go | 67 +++++++++++++++++++++-
9 files changed, 128 insertions(+), 21 deletions(-)
diff --git a/go/arrow/array/extension.go b/go/arrow/array/extension.go
index 10ad54c948..cd856df675 100644
--- a/go/arrow/array/extension.go
+++ b/go/arrow/array/extension.go
@@ -38,7 +38,8 @@ type ExtensionArray interface {
ExtensionType() arrow.ExtensionType
// Storage returns the underlying storage array for this array.
Storage() arrow.Array
-
+ // ValueString returns a string represenation of the value at the given index for the extension array.
+ ValueString(i int) string
// by having a non-exported function in the interface, it means that
// consumers must embed ExtensionArrayBase in their structs in order
// to fulfill this interface.
@@ -184,6 +185,12 @@ func (e *ExtensionArrayBase) setData(data *Data) {
e.storage = MakeFromData(storageData).(arraymarshal)
}
+// ValueString returns the value at index i as a string.
+// This needs to be implemented by the extension array type.
+func (e *ExtensionArrayBase) ValueString(i int) string {
+ panic("arrow/array: ValueString wasn't implemented by this extension array type")
+}
+
// no-op function that exists simply to force embedding this in any extension array types.
func (ExtensionArrayBase) mustEmbedExtensionArrayBase() {}
diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go
index 74dbf36f60..3dbc82d64b 100644
--- a/go/arrow/csv/common.go
+++ b/go/arrow/csv/common.go
@@ -224,6 +224,7 @@ func validate(schema *arrow.Schema) {
case *arrow.Decimal128Type, *arrow.Decimal256Type:
case *arrow.ListType:
case *arrow.BinaryType:
+ case arrow.ExtensionType:
default:
panic(fmt.Errorf("arrow/csv: field %d (%s) has invalid data type %T", i, f.Name, ft))
}
diff --git a/go/arrow/csv/reader.go b/go/arrow/csv/reader.go
index abb3db3627..fd24365c0a 100644
--- a/go/arrow/csv/reader.go
+++ b/go/arrow/csv/reader.go
@@ -35,6 +35,7 @@ import (
"github.com/apache/arrow/go/v12/arrow/decimal256"
"github.com/apache/arrow/go/v12/arrow/internal/debug"
"github.com/apache/arrow/go/v12/arrow/memory"
+ "github.com/goccy/go-json"
)
// Reader wraps encoding/csv.Reader and creates array.Records from a schema.
@@ -474,6 +475,10 @@ func (r *Reader) initFieldConverter(bldr array.Builder) func(string) {
return func(s string) {
r.parseBinaryType(bldr, s)
}
+ case arrow.ExtensionType:
+ return func(s string) {
+ r.parseExtension(bldr, s)
+ }
default:
panic(fmt.Errorf("arrow/csv: unhandled field type %T", bldr.Type()))
}
@@ -773,6 +778,18 @@ func (r *Reader) parseBinaryType(field array.Builder, str string) {
field.(*array.BinaryBuilder).Append(decodedVal)
}
+func (r *Reader) parseExtension(field array.Builder, str string) {
+ if r.isNull(str) {
+ field.AppendNull()
+ return
+ }
+ dec := json.NewDecoder(strings.NewReader(`"` + str + `"`))
+ if err := field.UnmarshalOne(dec); err != nil {
+ r.err = err
+ return
+ }
+}
+
// Retain increases the reference count by 1.
// Retain may be called simultaneously from multiple goroutines.
func (r *Reader) Retain() {
diff --git a/go/arrow/csv/reader_test.go b/go/arrow/csv/reader_test.go
index 0257ca6a49..c5bd90d76e 100644
--- a/go/arrow/csv/reader_test.go
+++ b/go/arrow/csv/reader_test.go
@@ -31,6 +31,7 @@ import (
"github.com/apache/arrow/go/v12/arrow/csv"
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/decimal256"
+ "github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -335,6 +336,7 @@ func testCSVReader(t *testing.T, filepath string, withHeader bool) {
{Name: "ts", Type: arrow.FixedWidthTypes.Timestamp_ms},
{Name: "list(i64)", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
{Name: "binary", Type: arrow.BinaryTypes.Binary},
+ {Name: "uuid", Type: types.NewUUIDType()},
},
nil,
)
@@ -384,6 +386,7 @@ rec[0]["str"]: ["str-1"]
rec[0]["ts"]: [1652054461000]
rec[0]["list(i64)"]: [[1 2 3]]
rec[0]["binary"]: ["\x00\x01\x02"]
+rec[0]["uuid"]: ["00000000-0000-0000-0000-000000000001"]
rec[1]["bool"]: [false]
rec[1]["i8"]: [-2]
rec[1]["i16"]: [-2]
@@ -399,6 +402,7 @@ rec[1]["str"]: ["str-2"]
rec[1]["ts"]: [1652140799000]
rec[1]["list(i64)"]: [[]]
rec[1]["binary"]: [""]
+rec[1]["uuid"]: ["00000000-0000-0000-0000-000000000002"]
rec[2]["bool"]: [(null)]
rec[2]["i8"]: [(null)]
rec[2]["i16"]: [(null)]
@@ -414,6 +418,7 @@ rec[2]["str"]: [(null)]
rec[2]["ts"]: [(null)]
rec[2]["list(i64)"]: [(null)]
rec[2]["binary"]: [(null)]
+rec[2]["uuid"]: [(null)]
`
got, want := out.String(), want
require.Equal(t, want, got)
diff --git a/go/arrow/csv/testdata/header.csv b/go/arrow/csv/testdata/header.csv
index 3ee4aa740e..0987673cd5 100644
--- a/go/arrow/csv/testdata/header.csv
+++ b/go/arrow/csv/testdata/header.csv
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
#
-bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts;list(i64);binary
-true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC
-false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};
-null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
+bool;i8;i16;i32;i64;u8;u16;u32;u64;f32;f64;str;ts;list(i64);binary;uuid
+true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001
+false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
+null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
diff --git a/go/arrow/csv/testdata/types.csv b/go/arrow/csv/testdata/types.csv
index afdefa40b1..a90e35e253 100644
--- a/go/arrow/csv/testdata/types.csv
+++ b/go/arrow/csv/testdata/types.csv
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
#
-## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float32;float64;string;timestamp;list(i64);binary
-true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC
-false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};
-null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
+## supported types: bool;int8;int16;int32;int64;uint8;uint16;uint32;uint64;float32;float64;string;timestamp;binary;uuid
+true;-1;-1;-1;-1;1;1;1;1;1.1;1.1;str-1;2022-05-09T00:01:01;{1,2,3};AAEC;00000000-0000-0000-0000-000000000001
+false;-2;-2;-2;-2;2;2;2;2;2.2;2.2;str-2;2022-05-09T23:59:59;{};;00000000-0000-0000-0000-000000000002
+null;NULL;null;N/A;;null;null;null;null;null;null;null;null;null;null;null
\ No newline at end of file
diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go
index 9947439eec..46b0c4fdee 100644
--- a/go/arrow/csv/transformer.go
+++ b/go/arrow/csv/transformer.go
@@ -225,6 +225,15 @@ func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array) []
res[i] = w.nullValue
}
}
+ case arrow.ExtensionType:
+ arr := col.(array.ExtensionArray)
+ for i := 0; i < arr.Len(); i++ {
+ if arr.IsNull(i) {
+ res[i] = w.nullValue
+ } else {
+ res[i] = arr.ValueString(i)
+ }
+ }
default:
panic(fmt.Errorf("arrow/csv: field has unsupported data type %s", typ.String()))
}
diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go
index b16a568231..bfe0bcddf9 100644
--- a/go/arrow/csv/writer_test.go
+++ b/go/arrow/csv/writer_test.go
@@ -30,7 +30,9 @@ import (
"github.com/apache/arrow/go/v12/arrow/csv"
"github.com/apache/arrow/go/v12/arrow/decimal128"
"github.com/apache/arrow/go/v12/arrow/decimal256"
+ "github.com/apache/arrow/go/v12/arrow/internal/testing/types"
"github.com/apache/arrow/go/v12/arrow/memory"
+ "github.com/google/uuid"
)
const (
@@ -131,18 +133,18 @@ func Example_writer() {
var (
fullData = [][]string{
- {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "binary"},
- {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "AAEC"},
- {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "AwQF"},
- {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", ""},
- {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal},
+ {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "binary", "uuid"},
+ {"true", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "AAEC", "00000000-0000-0000-0000-000000000001"},
+ {"false", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "AwQF", "00000000-0000-0000-0000-000000000002"},
+ {"true", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "", "00000000-0000-0000-0000-000000000003"},
+ {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal},
}
bananaData = [][]string{
- {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "binary"},
- {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "AAEC"},
- {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "AwQF"},
- {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", ""},
- {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal},
+ {"bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f32", "f64", "str", "ts_s", "d32", "d64", "dec128", "dec256", "list(i64)", "binary", "uuid"},
+ {"BANANA", "-1", "-1", "-1", "-1", "0", "0", "0", "0", "0", "0", "str-0", "2014-07-28 15:04:05", "2017-05-18", "2028-04-26", "-123.45", "-123.45", "{1,2,3}", "AAEC", "00000000-0000-0000-0000-000000000001"},
+ {"MANGO", "0", "0", "0", "0", "1", "1", "1", "1", "0.1", "0.1", "str-1", "2016-09-08 15:04:05", "2022-11-08", "2031-06-28", "0", "0", "{4,5,6}", "AwQF", "00000000-0000-0000-0000-000000000002"},
+ {"BANANA", "1", "1", "1", "1", "2", "2", "2", "2", "0.2", "0.2", "str-2", "2021-09-18 15:04:05", "2025-08-04", "2034-08-28", "123.45", "123.45", "{7,8,9}", "", "00000000-0000-0000-0000-000000000003"},
+ {nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal, nullVal},
}
)
@@ -220,6 +222,7 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo
{Name: "dec256", Type: &arrow.Decimal256Type{Precision: 5, Scale: 2}},
{Name: "list(i64)", Type: arrow.ListOf(arrow.PrimitiveTypes.Int64)},
{Name: "binary", Type: arrow.BinaryTypes.Binary},
+ {Name: "uuid", Type: types.NewUUIDType()},
},
nil,
)
@@ -253,6 +256,8 @@ func testCSVWriter(t *testing.T, data [][]string, writeHeader bool, fmtr func(bo
listBuilder.Append(true)
listBuilderInt64.AppendValues([]int64{7, 8, 9}, nil)
b.Field(18).(*array.BinaryBuilder).AppendValues([][]byte{{0, 1, 2}, {3, 4, 5}, {}}, nil)
+ b.Field(19).(*types.UUIDBuilder).AppendValues([]uuid.UUID{uuid.MustParse("00000000-0000-0000-0000-000000000001"), uuid.MustParse("00000000-0000-0000-0000-000000000002"), uuid.MustParse("00000000-0000-0000-0000-000000000003")}, nil)
+
for _, field := range b.Fields() {
field.AppendNull()
diff --git a/go/arrow/internal/testing/types/extension_types.go b/go/arrow/internal/testing/types/extension_types.go
index c2e9133fe8..80d8111b4e 100644
--- a/go/arrow/internal/testing/types/extension_types.go
+++ b/go/arrow/internal/testing/types/extension_types.go
@@ -49,8 +49,9 @@ func (b *UUIDBuilder) Append(v uuid.UUID) {
func (b *UUIDBuilder) AppendValues(v []uuid.UUID, valid []bool) {
data := make([][]byte, len(v))
- for i, v := range v {
- data[i] = v[:]
+ for i, u := range v {
+ data[i] = make([]byte, 16)
+ copy(data[i][:], u[:])
}
b.ExtensionBuilder.Builder.(*array.FixedSizeBinaryBuilder).AppendValues(data, valid)
}
@@ -121,6 +122,18 @@ type UUIDArray struct {
array.ExtensionArrayBase
}
+func (a UUIDArray) ValueString(i int) string {
+ if a.IsNull(i) {
+ return "(null)"
+ }
+ arr := a.Storage().(*array.FixedSizeBinary)
+ uuidStr, err := uuid.FromBytes(arr.Value(i))
+ if err != nil {
+ panic(fmt.Errorf("invalid uuid: %w", err))
+ }
+ return uuidStr.String()
+}
+
func (a UUIDArray) String() string {
arr := a.Storage().(*array.FixedSizeBinary)
o := new(strings.Builder)
@@ -222,6 +235,14 @@ type Parametric1Array struct {
array.ExtensionArrayBase
}
+func (a Parametric1Array) ValueString(i int) string {
+ arr := a.Storage().(*array.Int32)
+ if a.IsNull(i) {
+ return "(null)"
+ }
+ return fmt.Sprintf("%d", arr.Value(i))
+}
+
// Parametric2Array is another simple int32 array for use with the Parametric2Type
// also for testing a parameterized user-defined extension type that utilizes
// the parameter for defining different types based on the param.
@@ -229,6 +250,15 @@ type Parametric2Array struct {
array.ExtensionArrayBase
}
+func (a Parametric2Array) ValueString(i int) string {
+ arr := a.Storage().(*array.Int32)
+ if a.IsNull(i) {
+ return "(null)"
+ }
+ return fmt.Sprintf("%d", arr.Value(i))
+}
+
+
// A type where ExtensionName is always the same
type Parametric1Type struct {
arrow.ExtensionBase
@@ -236,6 +266,7 @@ type Parametric1Type struct {
param int32
}
+
func NewParametric1Type(p int32) *Parametric1Type {
ret := &Parametric1Type{param: p}
ret.ExtensionBase.Storage = arrow.PrimitiveTypes.Int32
@@ -339,6 +370,18 @@ type ExtStructArray struct {
array.ExtensionArrayBase
}
+func (a ExtStructArray) ValueString(i int) string {
+ arr := a.Storage().(*array.Struct)
+ if a.IsNull(i) {
+ return "(null)"
+ }
+ b, err := arr.MarshalJSON()
+ if err != nil {
+ panic(err)
+ }
+ return string(b)
+}
+
// ExtStructType is an extension type with a non-primitive storage type containing a struct
// with fields {a: int64, b: float64}
type ExtStructType struct {
@@ -384,6 +427,18 @@ type DictExtensionArray struct {
array.ExtensionArrayBase
}
+func (a DictExtensionArray) ValueString(i int) string {
+ arr := a.Storage().(*array.Dictionary)
+ if a.IsNull(i) {
+ return "(null)"
+ }
+ b, err := arr.MarshalJSON()
+ if err != nil {
+ panic(err)
+ }
+ return string(b)
+}
+
type DictExtensionType struct {
arrow.ExtensionBase
}
@@ -423,6 +478,14 @@ type SmallintArray struct {
array.ExtensionArrayBase
}
+func (a SmallintArray) ValueString(i int) string {
+ if a.IsNull(i) {
+ return "(null)"
+ }
+ arr := a.Storage().(*array.Int16)
+ return fmt.Sprintf("%d", arr.Value(i))
+}
+
type SmallintType struct {
arrow.ExtensionBase
}