Danielius1922 commented on code in PR #654:
URL: https://github.com/apache/arrow-go/pull/654#discussion_r2848350096
##########
parquet/internal/encoding/byte_stream_split_big_endian.go:
##########
@@ -19,43 +19,98 @@
package encoding
import (
- "fmt"
+ "unsafe"
+ "github.com/apache/arrow-go/v18/parquet"
"github.com/apache/arrow-go/v18/parquet/internal/debug"
)
// decodeByteStreamSplitBatchWidth4InByteOrder decodes the batch of nValues
raw bytes representing a 4-byte datatype provided
-// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding.
The values are expected to be in little-endian
+// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding.
The values are expected to be in big-endian
// byte order and are decoded into the 'out' array in the machine's native
endianness.
// 'out' must have space for at least len(data) bytes.
-func decodeByteStreamSplitBatchWidth4InByteOrder(data []byte, nValues, stride
int, out []byte) {
+func decodeByteStreamSplitBatchWidth4InByteOrderDefault(data []byte, nValues,
stride int, out []byte) {
const width = 4
- debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space
in output buffer for decoding, out: %d bytes, data: %d bytes", len(out),
len(data)))
- for element := 0; element < nValues; element++ {
- // Big Endian: most significant byte first
- out[width*element+0] = data[3*stride+element]
- out[width*element+1] = data[2*stride+element]
- out[width*element+2] = data[stride+element]
- out[width*element+3] = data[element]
+ debug.Assert(len(out) >= nValues*width, "not enough space in output
buffer for decoding")
+ // the beginning of the data slice can be truncated, but for valid
encoding we need at least (width-1)*stride+nValues bytes
+ debug.Assert(len(data) >= 3*stride+nValues, "not enough data for
decoding")
+ s0 := data[:nValues]
+ s1 := data[stride : stride+nValues]
+ s2 := data[2*stride : 2*stride+nValues]
+ s3 := data[3*stride : 3*stride+nValues]
+ out = out[:width*nValues]
+ out32 := unsafe.Slice((*uint32)(unsafe.Pointer(&out[0])), nValues)
+ for i := range nValues {
+ // Big-endian machine: put s0 as MSB, s3 as LSB
+ out32[i] = uint32(s3[i])<<24 | uint32(s2[i])<<16 |
uint32(s1[i])<<8 | uint32(s0[i])
}
}
// decodeByteStreamSplitBatchWidth8InByteOrder decodes the batch of nValues
raw bytes representing an 8-byte datatype provided
-// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding.
The values are expected to be in little-endian
+// by 'data', into the output buffer 'out' using BYTE_STREAM_SPLIT encoding.
The values are expected to be in big-endian
// byte order and are decoded into the 'out' array in the machine's native
endianness.
// 'out' must have space for at least len(data) bytes.
-func decodeByteStreamSplitBatchWidth8InByteOrder(data []byte, nValues, stride
int, out []byte) {
+func decodeByteStreamSplitBatchWidth8InByteOrderDefault(data []byte, nValues,
stride int, out []byte) {
const width = 8
- debug.Assert(len(out) >= nValues*width, fmt.Sprintf("not enough space
in output buffer for decoding, out: %d bytes, data: %d bytes", len(out),
len(data)))
+ debug.Assert(len(out) >= nValues*width, "not enough space in output
buffer for decoding")
+ debug.Assert(len(data) >= 7*stride+nValues, "not enough data for
decoding")
+ s0 := data[:nValues]
+ s1 := data[stride : stride+nValues]
+ s2 := data[2*stride : 2*stride+nValues]
+ s3 := data[3*stride : 3*stride+nValues]
+ s4 := data[4*stride : 4*stride+nValues]
+ s5 := data[5*stride : 5*stride+nValues]
+ s6 := data[6*stride : 6*stride+nValues]
+ s7 := data[7*stride : 7*stride+nValues]
+ out = out[:width*nValues]
+ out64 := unsafe.Slice((*uint64)(unsafe.Pointer(&out[0])), nValues)
+ for i := range nValues {
+ // Big-endian machine: put s0 as MSB, s7 as LSB
+ out64[i] = uint64(s7[i])<<56 | uint64(s6[i])<<48 |
uint64(s5[i])<<40 | uint64(s4[i])<<32 |
+ uint64(s3[i])<<24 | uint64(s2[i])<<16 |
uint64(s1[i])<<8 | uint64(s0[i])
+ }
+}
+
+// decodeByteStreamSplitBatchFLBAWidth2 decodes the batch of nValues
FixedLenByteArrays of length 2 provided by 'data',
+// into the output slice 'out' using BYTE_STREAM_SPLIT encoding.
+// 'out' must have space for at least nValues slices.
+func decodeByteStreamSplitBatchFLBAWidth2(data []byte, nValues, stride int,
out []parquet.FixedLenByteArray) {
Review Comment:
@zeroshade what is the interpretation of BSS encoding when applied to a byte
array? Since everything in Parquet is little-endian, the data is little-endian.
But what about the out array — should its bytes be reordered on big-endian machines?
##########
parquet/internal/encoding/byte_stream_split.go:
##########
@@ -30,7 +30,7 @@ import (
// encodeByteStreamSplit encodes the raw bytes provided by 'in' into the
output buffer 'data' using BYTE_STREAM_SPLIT encoding.
// 'data' must have space for at least len(in) bytes.
func encodeByteStreamSplit(data []byte, in []byte, width int) {
- debug.Assert(len(data) >= len(in), fmt.Sprintf("not enough space in
destination buffer for encoding, dest: %d bytes, src: %d bytes", len(data),
len(in)))
+ debug.Assert(len(data) >= len(in), "not enough space in destination
buffer for encoding")
Review Comment:
The allocations caused by fmt.Sprintf messed with the timings; I think the shorter
messages are fine for the asserts.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]