zeroshade commented on code in PR #38367:
URL: https://github.com/apache/arrow/pull/38367#discussion_r1376372506


##########
go/parquet/internal/encoding/boolean_decoder.go:
##########
@@ -109,3 +114,76 @@ func (dec *PlainBooleanDecoder) DecodeSpaced(out []bool, 
nullCount int, validBit
        }
        return dec.Decode(out)
 }
+
+type RleBooleanDecoder struct {
+       decoder
+
+       rleDec *utils.RleDecoder
+}
+
+func (RleBooleanDecoder) Type() parquet.Type {
+       return parquet.Types.Boolean
+}
+
+func (dec *RleBooleanDecoder) SetData(nvals int, data []byte) error {
+       dec.nvals = nvals
+
+       if len(data) < 4 {
+               return fmt.Errorf("invalid length - %d (corrupt data page?)", 
len(data))
+       }
+
+       // load the first 4 bytes in little-endian which indicates the length
+       nbytes := binary.LittleEndian.Uint32(data[:4])
+       if nbytes > uint32(len(data)-4) {
+               return fmt.Errorf("received invalid number of bytes - %d 
(corrupt data page?)", nbytes)
+       }
+
+       dec.data = data[4:]
+       if dec.rleDec == nil {
+               dec.rleDec = utils.NewRleDecoder(bytes.NewReader(dec.data), 1)
+       } else {
+               dec.rleDec.Reset(bytes.NewReader(dec.data), 1)
+       }
+       return nil
+}
+
+func (dec *RleBooleanDecoder) Decode(out []bool) (int, error) {
+       max := shared_utils.MinInt(len(out), dec.nvals)
+
+       var (
+               buf [1024]uint64
+               n   = max
+       )
+
+       for n > 0 {
+               batch := shared_utils.MinInt(len(buf), n)
+               decoded := dec.rleDec.GetBatch(buf[:batch])
+               if decoded != batch {
+                       return max - n, io.ErrUnexpectedEOF

Review Comment:
   > No, max is not the real maximum of the underlying data. For the rep-def 
existing scenario, it means "rep-def" size, rather than the actual size. So it 
may be greater than the existing value. (At least this is the case in C++)
   
   That is handled a level above where we are here. This is *solely* the value decoder. 
In this function `max` is the maximum number of *values* that are decoded by a 
single call to this function (i.e. the minimum of the length of the output 
slice and the number of values left to decode). 
   
   The handling of rep and def levels is taken care of a level above this by 
the actual page reader / column chunk reader, which determines how many 
physical values it wants to read from the decoder. The intent of this function 
is that the return value is the *number of physical values populated into the 
output slice*. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to