zeroshade commented on code in PR #33965:
URL: https://github.com/apache/arrow/pull/33965#discussion_r1093505946
##########
go/parquet/pqarrow/encode_arrow.go:
##########
@@ -421,12 +425,21 @@ func writeDenseArrow(ctx *arrowWriteContext, cw file.ColumnChunkWriter, leafArr
 			wr.WriteBatchSpaced(leafArr.(*array.Float64).Float64Values(), defLevels, repLevels, leafArr.NullBitmapBytes(), int64(leafArr.Data().Offset()))
 		}
 	case *file.ByteArrayColumnChunkWriter:
-		if leafArr.DataType().ID() != arrow.STRING && leafArr.DataType().ID() != arrow.BINARY {
-			return xerrors.New("invalid column type to write to ByteArray")
+		var offsets []int64
+		switch leafArr.DataType().ID() {
+		case arrow.STRING, arrow.BINARY:
+			off32 := leafArr.(binaryarr).ValueOffsets()
+			offsets = make([]int64, len(off32))
+			for i, o := range off32 {
+				offsets[i] = int64(o)
+			}
+		case arrow.LARGE_STRING, arrow.LARGE_BINARY:
+			offsets = leafArr.(binary64arr).ValueOffsets()
+		default:
+			return xerrors.New(fmt.Sprintf("invalid column type to write to ByteArray: %s", leafArr.DataType().Name()))
Review Comment:
Rather than pay the cost of copying the offsets, I'd rather just duplicate the loop at line 453 based on whether you're using int32 or int64 offsets. Duplicating those 4 lines of code is better than copying a potentially large offset array just to adjust for int32 vs. int64 (roughly as sketched below).
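For illustration only, a minimal sketch of the duplicated loops, assuming the loop at line 453 slices the raw value buffer into `parquet.ByteArray` values; the helper names and signatures here are hypothetical, not the actual code:

```go
// Sketch only: hypothetical helpers, one loop per offset width, so
// neither path has to copy the offsets slice. Assumes the parquet
// package is imported for the ByteArray ([]byte) type.
func byteArraysFrom32(values []byte, offsets []int32, out []parquet.ByteArray) {
	for i := 0; i < len(offsets)-1; i++ {
		out[i] = parquet.ByteArray(values[offsets[i]:offsets[i+1]])
	}
}

func byteArraysFrom64(values []byte, offsets []int64, out []parquet.ByteArray) {
	for i := 0; i < len(offsets)-1; i++ {
		out[i] = parquet.ByteArray(values[offsets[i]:offsets[i+1]])
	}
}
```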
When go1.20 is released, I'd consider it viable to move us to go1.18 as the minimum requirement, at which point we can make a generic version of the loop via a function and just have that work (see the sketch below). But for now, let's just duplicate the loop and avoid copying the offset slice.
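Again purely illustrative, a sketch of that future generic form (hypothetical name and signature, matching the helpers above):

```go
// Hypothetical generic version, viable once go1.18 is the minimum:
// a single type parameter covers both offset widths with one loop body.
func byteArrays[T int32 | int64](values []byte, offsets []T, out []parquet.ByteArray) {
	for i := 0; i < len(offsets)-1; i++ {
		out[i] = parquet.ByteArray(values[offsets[i]:offsets[i+1]])
	}
}
```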