This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c170af41ba GH-39552: [Go] inclusion of option to use replacer when
creating csv strings with go library (#39576)
c170af41ba is described below
commit c170af41ba0c30b80aa4172da0b3637206368cf2
Author: Jânio <[email protected]>
AuthorDate: Wed Jan 17 14:00:39 2024 -0300
GH-39552: [Go] inclusion of option to use replacer when creating csv
strings with go library (#39576)
Rationale for this change
Make it possible to remove unwanted characters from strings
What changes are included in this PR?
Add a new function to optionally set up a replacer used by the csv Writer's Write method
Are these changes tested?
Yes
Are there any user-facing changes?
Adds an optional method.
* Closes: #39552
Lead-authored-by: Jânio <[email protected]>
Co-authored-by: janiodev <[email protected]>
Signed-off-by: Matt Topol <[email protected]>
---
go/arrow/csv/common.go | 14 ++++++++++++++
go/arrow/csv/transformer.go | 12 ++++++------
go/arrow/csv/writer.go | 24 +++++++++++++-----------
go/arrow/csv/writer_test.go | 6 ++++--
4 files changed, 37 insertions(+), 19 deletions(-)
diff --git a/go/arrow/csv/common.go b/go/arrow/csv/common.go
index 99dac29f4d..31ca61f323 100644
--- a/go/arrow/csv/common.go
+++ b/go/arrow/csv/common.go
@@ -21,6 +21,7 @@ package csv
import (
"errors"
"fmt"
+ "strings"
"github.com/apache/arrow/go/v15/arrow"
"github.com/apache/arrow/go/v15/arrow/memory"
@@ -223,6 +224,19 @@ func WithIncludeColumns(cols []string) Option {
}
}
+// WithStringsReplacer receives a replacer to be applied in the string fields
+// of the CSV. This is useful to remove unwanted characters from the string.
+func WithStringsReplacer(replacer *strings.Replacer) Option {
+ return func(cfg config) {
+ switch cfg := cfg.(type) {
+ case *Writer:
+ cfg.stringReplacer = replacer.Replace
+ default:
+ panic(fmt.Errorf("arrow/csv: unknown config type %T",
cfg))
+ }
+ }
+}
+
func validate(schema *arrow.Schema) {
for i, f := range schema.Fields() {
switch ft := f.Type.(type) {
diff --git a/go/arrow/csv/transformer.go b/go/arrow/csv/transformer.go
index 0f0181520b..78b16446d4 100644
--- a/go/arrow/csv/transformer.go
+++ b/go/arrow/csv/transformer.go
@@ -29,7 +29,7 @@ import (
"github.com/apache/arrow/go/v15/arrow/array"
)
-func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array)
[]string {
+func (w *Writer) transformColToStringArr(typ arrow.DataType, col arrow.Array,
stringsReplacer func(string)string) []string {
res := make([]string, col.Len())
switch typ.(type) {
case *arrow.BooleanType:
@@ -144,7 +144,7 @@ func (w *Writer) transformColToStringArr(typ
arrow.DataType, col arrow.Array) []
arr := col.(*array.String)
for i := 0; i < arr.Len(); i++ {
if arr.IsValid(i) {
- res[i] = arr.Value(i)
+ res[i] = stringsReplacer(arr.Value(i))
} else {
res[i] = w.nullValue
}
@@ -153,7 +153,7 @@ func (w *Writer) transformColToStringArr(typ
arrow.DataType, col arrow.Array) []
arr := col.(*array.LargeString)
for i := 0; i < arr.Len(); i++ {
if arr.IsValid(i) {
- res[i] = arr.Value(i)
+ res[i] = stringsReplacer(arr.Value(i))
} else {
res[i] = w.nullValue
}
@@ -224,7 +224,7 @@ func (w *Writer) transformColToStringArr(typ
arrow.DataType, col arrow.Array) []
var b bytes.Buffer
b.Write([]byte{'{'})
writer := csv.NewWriter(&b)
-
writer.Write(w.transformColToStringArr(list.DataType(), list))
+
writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer))
writer.Flush()
b.Truncate(b.Len() - 1)
b.Write([]byte{'}'})
@@ -243,7 +243,7 @@ func (w *Writer) transformColToStringArr(typ
arrow.DataType, col arrow.Array) []
var b bytes.Buffer
b.Write([]byte{'{'})
writer := csv.NewWriter(&b)
-
writer.Write(w.transformColToStringArr(list.DataType(), list))
+
writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer))
writer.Flush()
b.Truncate(b.Len() - 1)
b.Write([]byte{'}'})
@@ -262,7 +262,7 @@ func (w *Writer) transformColToStringArr(typ
arrow.DataType, col arrow.Array) []
var b bytes.Buffer
b.Write([]byte{'{'})
writer := csv.NewWriter(&b)
-
writer.Write(w.transformColToStringArr(list.DataType(), list))
+
writer.Write(w.transformColToStringArr(list.DataType(), list, stringsReplacer))
writer.Flush()
b.Truncate(b.Len() - 1)
b.Write([]byte{'}'})
diff --git a/go/arrow/csv/writer.go b/go/arrow/csv/writer.go
index a672008b58..b939b72984 100644
--- a/go/arrow/csv/writer.go
+++ b/go/arrow/csv/writer.go
@@ -27,12 +27,13 @@ import (
// Writer wraps encoding/csv.Writer and writes arrow.Record based on a schema.
type Writer struct {
- boolFormatter func(bool) string
- header bool
- nullValue string
- once sync.Once
- schema *arrow.Schema
- w *csv.Writer
+ boolFormatter func(bool) string
+ header bool
+ nullValue string
+ stringReplacer func(string) string
+ once sync.Once
+ schema *arrow.Schema
+ w *csv.Writer
}
// NewWriter returns a writer that writes arrow.Records to the CSV file
@@ -45,10 +46,11 @@ func NewWriter(w io.Writer, schema *arrow.Schema, opts
...Option) *Writer {
validate(schema)
ww := &Writer{
- boolFormatter: strconv.FormatBool, // override by passing
WithBoolWriter() as an option
- nullValue: "NULL", // override by passing
WithNullWriter() as an option
- schema: schema,
- w: csv.NewWriter(w),
+ boolFormatter: strconv.FormatBool, // override
by passing WithBoolWriter() as an option
+ nullValue: "NULL", // override
by passing WithNullWriter() as an option
+ stringReplacer: func(x string) string { return x }, // override
by passing WithStringsReplacer() as an option
+ schema: schema,
+ w: csv.NewWriter(w),
}
for _, opt := range opts {
opt(ww)
@@ -81,7 +83,7 @@ func (w *Writer) Write(record arrow.Record) error {
}
for j, col := range record.Columns() {
- rows := w.transformColToStringArr(w.schema.Field(j).Type, col)
+ rows := w.transformColToStringArr(w.schema.Field(j).Type, col,
w.stringReplacer)
for i, row := range rows {
recs[i][j] = row
}
diff --git a/go/arrow/csv/writer_test.go b/go/arrow/csv/writer_test.go
index 644cae0933..b1bd3251c5 100644
--- a/go/arrow/csv/writer_test.go
+++ b/go/arrow/csv/writer_test.go
@@ -23,6 +23,7 @@ import (
"fmt"
"io"
"log"
+ "strings"
"testing"
"github.com/apache/arrow/go/v15/arrow"
@@ -250,8 +251,8 @@ func testCSVWriter(t *testing.T, data [][]string,
writeHeader bool, fmtr func(bo
b.Field(9).(*array.Float16Builder).AppendValues([]float16.Num{float16.New(0.0),
float16.New(0.1), float16.New(0.2)}, nil)
b.Field(10).(*array.Float32Builder).AppendValues([]float32{0.0, 0.1,
0.2}, nil)
b.Field(11).(*array.Float64Builder).AppendValues([]float64{0.0, 0.1,
0.2}, nil)
- b.Field(12).(*array.StringBuilder).AppendValues([]string{"str-0",
"str-1", "str-2"}, nil)
- b.Field(13).(*array.LargeStringBuilder).AppendValues([]string{"str-0",
"str-1", "str-2"}, nil)
+ b.Field(12).(*array.StringBuilder).AppendValues([]string{"str_0",
"str-1", "str-2"}, nil)
+ b.Field(13).(*array.LargeStringBuilder).AppendValues([]string{"str_0",
"str-1", "str-2"}, nil)
b.Field(14).(*array.TimestampBuilder).AppendValues(genTimestamps(arrow.Second),
nil)
b.Field(15).(*array.Date32Builder).AppendValues([]arrow.Date32{17304,
19304, 20304}, nil)
b.Field(16).(*array.Date64Builder).AppendValues([]arrow.Date64{1840400000000,
1940400000000, 2040400000000}, nil)
@@ -300,6 +301,7 @@ func testCSVWriter(t *testing.T, data [][]string,
writeHeader bool, fmtr func(bo
csv.WithHeader(writeHeader),
csv.WithNullWriter(nullVal),
csv.WithBoolWriter(fmtr),
+ csv.WithStringsReplacer(strings.NewReplacer("_", "-")),
)
err := w.Write(rec)
if err != nil {