kou commented on code in PR #38367:
URL: https://github.com/apache/arrow/pull/38367#discussion_r1366269242
##########
go/parquet/internal/encoding/boolean_decoder.go:
##########
@@ -109,3 +114,76 @@ func (dec *PlainBooleanDecoder) DecodeSpaced(out []bool,
nullCount int, validBit
}
return dec.Decode(out)
}
+
+type RleBooleanDecoder struct {
+ decoder
+
+ rleDec *utils.RleDecoder
+}
+
+func (RleBooleanDecoder) Type() parquet.Type {
+ return parquet.Types.Boolean
+}
+
+func (dec *RleBooleanDecoder) SetData(nvals int, data []byte) error {
+ dec.nvals = nvals
+
+ if len(data) < 4 {
+ return fmt.Errorf("invalid length - %d (corrupt data page?)",
len(data))
+ }
+
+ // load the first 4 bytes in little-endian which indicates the length
+ nbytes := binary.LittleEndian.Uint32(data[:4])
+ if nbytes > uint32(len(data)-4) {
+ return fmt.Errorf("received invalid number of bytes - %d
(corrupt data page?)", nbytes)
+ }
+
+ dec.data = data[4:]
Review Comment:
Should we use `nbytes` to bound the slice here, so that the decoder only sees the declared payload length?
```suggestion
dec.data = data[4:(4 + nbytes)]
```
##########
go/parquet/internal/encoding/boolean_decoder.go:
##########
@@ -109,3 +114,76 @@ func (dec *PlainBooleanDecoder) DecodeSpaced(out []bool,
nullCount int, validBit
}
return dec.Decode(out)
}
+
+type RleBooleanDecoder struct {
+ decoder
+
+ rleDec *utils.RleDecoder
+}
+
+func (RleBooleanDecoder) Type() parquet.Type {
+ return parquet.Types.Boolean
+}
+
+func (dec *RleBooleanDecoder) SetData(nvals int, data []byte) error {
+ dec.nvals = nvals
+
+ if len(data) < 4 {
+ return fmt.Errorf("invalid length - %d (corrupt data page?)",
len(data))
+ }
+
+ // load the first 4 bytes in little-endian which indicates the length
+ nbytes := binary.LittleEndian.Uint32(data[:4])
+ if nbytes > uint32(len(data)-4) {
+ return fmt.Errorf("received invalid number of bytes - %d
(corrupt data page?)", nbytes)
+ }
+
+ dec.data = data[4:]
+ if dec.rleDec == nil {
+ dec.rleDec = utils.NewRleDecoder(bytes.NewReader(dec.data), 1)
+ } else {
+ dec.rleDec.Reset(bytes.NewReader(dec.data), 1)
+ }
+ return nil
+}
+
+func (dec *RleBooleanDecoder) Decode(out []bool) (int, error) {
+ max := shared_utils.MinInt(len(out), dec.nvals)
+
+ var (
+ buf [1024]uint64
+ n = max
+ )
+
+ for n > 0 {
+ batch := shared_utils.MinInt(1024, n)
Review Comment:
```suggestion
batch := shared_utils.MinInt(len(buf), n)
```
##########
go/parquet/internal/encoding/boolean_encoder.go:
##########
@@ -87,3 +89,53 @@ func (enc *PlainBooleanEncoder) FlushValues() (Buffer,
error) {
return enc.sink.Finish(), nil
}
+
+type RleBooleanEncoder struct {
+ encoder
+
+ bufferedValues []bool
+}
+
+func (RleBooleanEncoder) Type() parquet.Type {
+ return parquet.Types.Boolean
+}
+
+func (enc *RleBooleanEncoder) Put(in []bool) {
+ enc.bufferedValues = append(enc.bufferedValues, in...)
+}
+
+func (enc *RleBooleanEncoder) PutSpaced(in []bool, validBits []byte,
validBitsOffset int64) {
+ bufferOut := make([]bool, len(in))
+ nvalid := spacedCompress(in, bufferOut, validBits, validBitsOffset)
+ enc.Put(bufferOut[:nvalid])
+}
+
+func (enc *RleBooleanEncoder) EstimatedDataEncodedSize() int64 {
+ const rleLengthInBytes = 4
Review Comment:
Can we share this value between here, `FlushValues()` and
`RleBooleanDecoder.SetData()` instead of defining it in multiple places
(or using `4` as a magic number)?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]