sunchao commented on a change in pull request #643:
URL: https://github.com/apache/arrow-rs/pull/643#discussion_r680548466



##########
File path: parquet/src/column/writer.rs
##########
@@ -1687,6 +1687,137 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_bool_statistics() {
+        let stats = statistics_roundtrip::<BoolType>(&[true, false, false, 
true]);
+        assert!(stats.has_min_max_set());
+        // should this be BooleanStatistics??
+        if let Statistics::Int32(stats) = stats {
+            assert_eq!(stats.min(), &0);
+            assert_eq!(stats.max(), &1);
+        } else {
+            panic!("expecting Statistics::Int32, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int32_statistics() {
+        let stats = statistics_roundtrip::<Int32Type>(&[-1, 3, -2, 2]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int32(stats) = stats {
+            assert_eq!(stats.min(), &-2);
+            assert_eq!(stats.max(), &3);
+        } else {
+            panic!("expecting Statistics::Int32, got {:?}", stats);
+        }
+    }
+
+    #[test]
+    fn test_int64_statistics() {
+        let stats = statistics_roundtrip::<Int64Type>(&[-1, 3, -2, 2]);
+        assert!(stats.has_min_max_set());
+        if let Statistics::Int64(stats) = stats {
+            assert_eq!(stats.min(), &-2);
+            assert_eq!(stats.max(), &3);
+        } else {
+            panic!("expecting Statistics::Int64, got {:?}", stats);
+        }
+    }
+
+    // // TODO test int 96 stats -- this was failing

Review comment:
       I think we should still keep INT96 for backward compatibility (it is 
still used today). In parquet-mr this is handled by converting the binary 
value into signed ints (see 
[here](https://github.com/apache/parquet-mr/blob/master/parquet-column/src/main/java/org/apache/parquet/schema/PrimitiveComparator.java#L230)).

##########
File path: parquet/src/data_type.rs
##########
@@ -116,23 +116,18 @@ pub struct ByteArray {
 
 impl PartialOrd for ByteArray {
     fn partial_cmp(&self, other: &ByteArray) -> Option<Ordering> {
-        if self.data.is_some() && other.data.is_some() {
-            match self.len().cmp(&other.len()) {
-                Ordering::Greater => Some(Ordering::Greater),
-                Ordering::Less => Some(Ordering::Less),
-                Ordering::Equal => {
-                    for (v1, v2) in 
self.data().iter().zip(other.data().iter()) {
-                        match v1.cmp(v2) {
-                            Ordering::Greater => return 
Some(Ordering::Greater),
-                            Ordering::Less => return Some(Ordering::Less),
-                            _ => {}
-                        }
-                    }
-                    Some(Ordering::Equal)
-                }
+        // sort nulls first (consistent with PartialCmp on Option)

Review comment:
       Good catch! The original PR is #7586, but yeah, we missed this in the 
review, and I think the existing logic is incorrect.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to