tustvold commented on code in PR #2089:
URL: https://github.com/apache/arrow-rs/pull/2089#discussion_r922677184
##########
parquet/src/data_type.rs:
##########
@@ -936,6 +982,24 @@ pub(crate) mod private {
Ok(num_values)
}
+ fn skip(decoder: &mut PlainDecoderDetails, num_values: usize) ->
Result<usize> {
+ let data = decoder
+ .data
+ .as_mut()
+ .expect("set_data should have been called");
+ let num_values = std::cmp::min(num_values, decoder.num_values);
Review Comment:
```suggestion
let num_values = num_values.min(decoder.num_values);
```
##########
parquet/src/data_type.rs:
##########
@@ -690,6 +692,15 @@ pub(crate) mod private {
Ok(values_read)
}
+ fn skip(decoder: &mut PlainDecoderDetails, num_values: usize) ->
Result<usize> {
+ let bit_reader = decoder.bit_reader.as_mut().unwrap();
+ let num_values = std::cmp::min(num_values, decoder.num_values);
+ let mut buffer = vec![false; num_values];
+ let values_read = bit_reader.get_batch(&mut buffer[..num_values],
1);
Review Comment:
Could this use `BitReader::skip` instead of allocating a buffer and calling `get_batch`?
##########
arrow/Cargo.toml:
##########
@@ -41,7 +41,7 @@ bench = false
ahash = { version = "0.7", default-features = false }
serde = { version = "1.0", default-features = false }
serde_derive = { version = "1.0", default-features = false }
-serde_json = { version = "1.0", default-features = false, features =
["preserve_order"] }
+serde_json = { version = "1.0", default-features = false, features =
["preserve_order","std"] }
Review Comment:
This shouldn't be necessary if you update your lockfile
##########
arrow-flight/src/arrow.flight.protocol.rs:
##########
@@ -279,9 +279,9 @@ pub mod flight_service_client {
&mut self,
request: impl tonic::IntoStreamingRequest<Message =
super::HandshakeRequest>,
) -> Result<
- tonic::Response<tonic::codec::Streaming<super::HandshakeResponse>>,
- tonic::Status,
- > {
+
tonic::Response<tonic::codec::Streaming<super::HandshakeResponse>>,
Review Comment:
I wonder if you have an out-of-date lockfile, which is resulting in these
changes?
##########
parquet/src/data_type.rs:
##########
@@ -764,6 +775,23 @@ pub(crate) mod private {
Ok(num_values)
}
+ #[inline]
+ fn skip(decoder: &mut PlainDecoderDetails, num_values: usize)
-> Result<usize> {
+ let data = decoder.data.as_ref().expect("set_data should
have been called");
+ let num_values = std::cmp::min(num_values,
decoder.num_values);
Review Comment:
```suggestion
let num_values = num_values.min(decoder.num_values);
```
##########
parquet/src/encodings/rle.rs:
##########
@@ -631,6 +723,45 @@ mod tests {
assert_eq!(buffer, expected);
}
+ #[test]
+ fn test_rle_skip_dict() {
+ // Test RLE encoding: 3 0s followed by 4 1s followed by 5 2s
+ // 00000110 00000000 00001000 00000001 00001010 00000010
+ let dict = vec![10, 20, 30];
+ let data = ByteBufferPtr::new(vec![0x06, 0x00, 0x08, 0x01, 0x0A,
0x02]);
+ let mut decoder: RleDecoder = RleDecoder::new(3);
+ decoder.set_data(data);
+ let mut buffer = vec![0; 10];
+ let expected = vec![10, 20, 20, 20, 20, 30, 30, 30, 30, 30];
+ let skipped = decoder.skip(2).expect("skipping two values");
+ assert_eq!(skipped, 2);
+ let remainder = decoder.get_batch_with_dict::<i32>(&dict, &mut buffer,
10).expect("getting remainder");
+ assert_eq!(remainder, 10);
+ assert_eq!(buffer, expected);
+
+ // Test bit-pack encoding: 345345345455 (2 groups: 8 and 4)
+ // 011 100 101 011 100 101 011 100 101 100 101 101
+ // 00000011 01100011 11000111 10001110 00000011 01100101 00001011
+ let dict = vec!["aaa", "bbb", "ccc", "ddd", "eee", "fff"];
+ let data = ByteBufferPtr::new(vec![0x03, 0x63, 0xC7, 0x8E, 0x03, 0x65,
0x0B]);
+ let mut decoder: RleDecoder = RleDecoder::new(3);
+ decoder.set_data(data);
+ let mut buffer = vec![""; 8];
+ let expected = vec![
+ "eee", "fff", "ddd", "eee", "fff", "eee", "fff",
+ "fff",
+ ];
+ let skipped = decoder.skip(4).expect("skipping two values");
Review Comment:
```suggestion
let skipped = decoder.skip(4).expect("skipping four values");
```
##########
parquet/src/util/bit_util.rs:
##########
@@ -605,6 +627,56 @@ impl BitReader {
values_to_read
}
+ /// Skip num_value values with num_bits bit width
+ ///
+ /// Return the number of values skipped (up to num_values)
+ pub fn skip(&mut self, num_values: usize, num_bits: usize) -> usize {
+ assert!(num_bits <= 64);
+
+ let mut num_values = num_values;
+ let needed_bits = num_bits * num_values;
+ let remaining_bits = (self.total_bytes - self.byte_offset) * 8 -
self.bit_offset;
+ if remaining_bits < needed_bits {
+ num_values = remaining_bits / num_bits;
+ }
+
+ let mut values_skipped = 0;
+
+ if num_bits > 32 {
Review Comment:
We don't need this special case for skip; it exists in get_batch because
we only have unpack32 and not unpack64.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]