alamb commented on a change in pull request #643:
URL: https://github.com/apache/arrow-rs/pull/643#discussion_r680352550
##########
File path: parquet/src/column/writer.rs
##########
@@ -1687,6 +1687,137 @@ mod tests {
);
}
+ #[test]
+ fn test_bool_statistics() {
+ let stats = statistics_roundtrip::<BoolType>(&[true, false, false,
true]);
+ assert!(stats.has_min_max_set());
+ // should this be BooleanStatistics??
+ if let Statistics::Int32(stats) = stats {
+ assert_eq!(stats.min(), &0);
+ assert_eq!(stats.max(), &1);
+ } else {
+ panic!("expecting Statistics::Int32, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_int32_statistics() {
+ let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Int32(stats) = stats {
+ assert_eq!(stats.min(), &-2);
+ assert_eq!(stats.max(), &3);
+ } else {
+ panic!("expecting Statistics::Int32, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_int64_statistics() {
+ let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Int64(stats) = stats {
+ assert_eq!(stats.min(), &-2);
+ assert_eq!(stats.max(), &3);
+ } else {
+ panic!("expecting Statistics::Int64, got {:?}", stats);
+ }
+ }
+
+ // // TODO test int 96 stats -- this was failing
Review comment:
Note 3: I have no idea if this test is incorrect or if there is
something wrong with the Int96 implementation. As Int96 seems to be deprecated,
I don't plan to spend much more time on it
##########
File path: parquet/src/data_type.rs
##########
@@ -116,23 +116,18 @@ pub struct ByteArray {
impl PartialOrd for ByteArray {
fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> {
- if self.data.is_some() && other.data.is_some() {
- match self.len().cmp(&other.len()) {
- Ordering::Greater => Some(Ordering::Greater),
- Ordering::Less => Some(Ordering::Less),
- Ordering::Equal => {
- for (v1, v2) in
self.data().iter().zip(other.data().iter()) {
- match v1.cmp(v2) {
- Ordering::Greater => return
Some(Ordering::Greater),
- Ordering::Less => return Some(Ordering::Less),
- _ => {}
- }
- }
- Some(Ordering::Equal)
- }
+ // sort nulls first (consistent with PartialCmp on Option)
Review comment:
The previous implementation of `PartialOrd` seems to have been
introduced in
https://github.com/apache/arrow-rs/commit/48c3771f274f3d89a46c79d61d04bcdd036028b9
in https://github.com/apache/arrow/pull/7622 by
It would be great if @zeevm or @sunchao had any additional context or
comment to share.
##########
File path: parquet/src/column/writer.rs
##########
@@ -1687,6 +1687,137 @@ mod tests {
);
}
+ #[test]
+ fn test_bool_statistics() {
+ let stats = statistics_roundtrip::<BoolType>(&[true, false, false,
true]);
+ assert!(stats.has_min_max_set());
+ // should this be BooleanStatistics??
+ if let Statistics::Int32(stats) = stats {
Review comment:
note 1: it is strange that a `Boolean` column produces `Int32Stats` (and
not `BooleanStats`)
##########
File path: parquet/src/column/writer.rs
##########
@@ -1687,6 +1687,137 @@ mod tests {
);
}
+ #[test]
+ fn test_bool_statistics() {
+ let stats = statistics_roundtrip::<BoolType>(&[true, false, false,
true]);
+ assert!(stats.has_min_max_set());
+ // should this be BooleanStatistics??
+ if let Statistics::Int32(stats) = stats {
+ assert_eq!(stats.min(), &0);
+ assert_eq!(stats.max(), &1);
+ } else {
+ panic!("expecting Statistics::Int32, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_int32_statistics() {
+ let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Int32(stats) = stats {
+ assert_eq!(stats.min(), &-2);
+ assert_eq!(stats.max(), &3);
+ } else {
+ panic!("expecting Statistics::Int32, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_int64_statistics() {
+ let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Int64(stats) = stats {
+ assert_eq!(stats.min(), &-2);
+ assert_eq!(stats.max(), &3);
+ } else {
+ panic!("expecting Statistics::Int64, got {:?}", stats);
+ }
+ }
+
+ // // TODO test int 96 stats -- this was failing
+ // #[test]
+ // fn test_int96_statistics() {
+ // let input = vec![-1, 3, -2, 2].into_iter()
+ // .map(to_i96)
+ // .collect::<Vec<Int96>>();
+
+ // let stats = statistics_roundtrip::<Int96Type>(&input);
+ // assert!(stats.has_min_max_set());
+ // if let Statistics::Int96(stats) = stats {
+ // assert_eq!(stats.min(), &to_i96(-2));
+ // assert_eq!(stats.max(), &to_i96(3));
+ // } else {
+ // panic!("expecting Statistics::Int96, got {:?}", stats);
+ // }
+ // }
+
+ // fn to_i96(v: i64) -> Int96 {
+ // // get 64 bytes
+ // let mut bytes: [u8; 12] = [0,0,0,0,0,0,0,0,0,0,0,0];
+
+ // if v.to_le_bytes() == v.to_ne_bytes() {
+ // // little endian
+ // bytes[0..8].clone_from_slice(&v.to_ne_bytes());
+ // }
+ // else {
+ // // big endian
+ // bytes[4..12].clone_from_slice(&v.to_ne_bytes());
+ // }
+ // Int96::from_ne_bytes(bytes)
+ // }
+
+ #[test]
+ fn test_float_statistics() {
+ let stats = statistics_roundtrip::<FloatType>(&[-1.0, 3.0, -2.0, 2.0]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Float(stats) = stats {
+ assert_eq!(stats.min(), &-2.0);
+ assert_eq!(stats.max(), &3.0);
+ } else {
+ panic!("expecting Statistics::Float, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_double_statistics() {
+ let stats = statistics_roundtrip::<DoubleType>(&[-1.0, 3.0, -2.0,
2.0]);
+ assert!(stats.has_min_max_set());
+ if let Statistics::Double(stats) = stats {
+ assert_eq!(stats.min(), &-2.0);
+ assert_eq!(stats.max(), &3.0);
+ } else {
+ panic!("expecting Statistics::Double, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_byte_array_statistics() {
+ let input = vec!["aawaa", "zz", "aaw", "m", "qrs"]
+ .iter()
+ .map(|&s| s.into())
+ .collect::<Vec<ByteArray>>();
+
+ let stats = statistics_roundtrip::<ByteArrayType>(&input);
+ assert!(stats.has_min_max_set());
+ if let Statistics::ByteArray(stats) = stats {
+ assert_eq!(stats.min(), &ByteArray::from("aaw"));
+ assert_eq!(stats.max(), &ByteArray::from("zz"));
+ } else {
+ panic!("expecting Statistics::ByteArray, got {:?}", stats);
+ }
+ }
+
+ #[test]
+ fn test_fixed_len_byte_array_statistics() {
+ let input = vec!["aawaa", "zz ", "aaw ", "m ", "qrs "]
+ .iter()
+ .map(|&s| {
+ let b: ByteArray = s.into();
+ b.into()
+ })
+ .collect::<Vec<FixedLenByteArray>>();
+
+ let stats = statistics_roundtrip::<FixedLenByteArrayType>(&input);
+ assert!(stats.has_min_max_set());
+ // should it be FixedLenByteArray?
Review comment:
note 2: it is strange that a `FixedLenByteArray` column produces
`ByteArrayStats` (and not `FixedLenByteArrayStats`)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]