liukun4515 commented on code in PR #2966:
URL: https://github.com/apache/arrow-datafusion/pull/2966#discussion_r930600956
##########
datafusion/core/src/physical_plan/file_format/parquet.rs:
##########
@@ -388,28 +391,82 @@ struct RowGroupPruningStatistics<'a> {
parquet_schema: &'a Schema,
}
+// TODO: consolidate code with arrow-rs
+// Convert the bytes array to i128.
+// The endian of the input bytes array must be big-endian.
+// Copy from the arrow-rs
+fn from_bytes_to_i128(b: &[u8]) -> i128 {
+ assert!(b.len() <= 16, "Decimal128Array supports only up to size 16");
+ let first_bit = b[0] & 128u8 == 128u8;
+ let mut result = if first_bit { [255u8; 16] } else { [0u8; 16] };
+ for (i, v) in b.iter().enumerate() {
+ result[i + (16 - b.len())] = *v;
+ }
+ // The bytes array are from parquet file and must be the big-endian.
+ // The endian is defined by parquet format, and the reference document
+ // https://github.com/apache/parquet-format/blob/54e53e5d7794d383529dd30746378f19a12afd58/src/main/thrift/parquet.thrift#L66
+ i128::from_be_bytes(result)
+}
+
/// Extract the min/max statistics from a `ParquetStatistics` object
macro_rules! get_statistic {
- ($column_statistics:expr, $func:ident, $bytes_func:ident) => {{
+ ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
if !$column_statistics.has_min_max_set() {
return None;
}
match $column_statistics {
ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))),
- ParquetStatistics::Int32(s) => Some(ScalarValue::Int32(Some(*s.$func()))),
- ParquetStatistics::Int64(s) => Some(ScalarValue::Int64(Some(*s.$func()))),
+ ParquetStatistics::Int32(s) => {
+ match $target_arrow_type {
+ // int32 to decimal with the precision and scale
+ Some(DataType::Decimal(precision, scale)) => {
+ Some(ScalarValue::Decimal128(
+ Some(*s.$func() as i128),
+ precision,
+ scale,
+ ))
+ }
+ _ => Some(ScalarValue::Int32(Some(*s.$func()))),
+ }
+ }
+ ParquetStatistics::Int64(s) => {
+ match $target_arrow_type {
+ // int64 to decimal with the precision and scale
+ Some(DataType::Decimal(precision, scale)) => {
+ Some(ScalarValue::Decimal128(
+ Some(*s.$func() as i128),
+ precision,
+ scale,
+ ))
+ }
+ _ => Some(ScalarValue::Int64(Some(*s.$func()))),
+ }
+ }
// 96 bit ints not supported
ParquetStatistics::Int96(_) => None,
ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))),
ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))),
ParquetStatistics::ByteArray(s) => {
+ // TODO support decimal type for byte array type
Review Comment:
tracked by https://github.com/apache/arrow-datafusion/issues/2970
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]