Re: [PR] Parquet: read/write f16 for Arrow [arrow-rs]

via GitHub Mon, 06 Nov 2023 20:44:31 -0800


Jefffrey commented on code in PR #5003:
URL: https://github.com/apache/arrow-rs/pull/5003#discussion_r1384372963



##########
parquet/src/column/writer/mod.rs:
##########
@@ -967,18 +969,23 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, 
E> {
 }
 
 fn update_min<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T, min: 
&mut Option<T>) {
-    update_stat::<T, _>(val, min, |cur| compare_greater(descr, cur, val))
+    update_stat::<T, _>(descr, val, min, |cur| compare_greater(descr, cur, 
val))
 }
 
 fn update_max<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T, max: 
&mut Option<T>) {
-    update_stat::<T, _>(val, max, |cur| compare_greater(descr, val, cur))
+    update_stat::<T, _>(descr, val, max, |cur| compare_greater(descr, val, 
cur))
 }
 
 #[inline]
 #[allow(clippy::eq_op)]
-fn is_nan<T: ParquetValueType>(val: &T) -> bool {
+fn is_nan<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T) -> bool {
     match T::PHYSICAL_TYPE {
         Type::FLOAT | Type::DOUBLE => val != val,
+        Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type() == 
Some(LogicalType::Float16) => {
+            let val = val.as_bytes();
+            let val = f16::from_le_bytes([val[0], val[1]]);
+            val.is_nan()
+        }

Review Comment:
   Purely from type `T` we can't determine if the FixedLenByteArray represents 
a Float16 logical type, hence need `ColumnDescriptor` param to provide this 
information, to subsequently check NaN for f16



##########
parquet/src/column/writer/mod.rs:
##########
@@ -1038,6 +1049,14 @@ fn compare_greater<T: ParquetValueType>(descr: 
&ColumnDescriptor, a: &T, b: &T)
         };
     };
 
+    if let Some(LogicalType::Float16) = descr.logical_type() {
+        let a = a.as_bytes();
+        let a = f16::from_le_bytes([a[0], a[1]]);
+        let b = b.as_bytes();
+        let b = f16::from_le_bytes([b[0], b[1]]);
+        return a > b;
+    }

Review Comment:
   Don't compare as bytes, compare as f16's



##########
parquet/src/column/writer/mod.rs:
##########
@@ -2077,6 +2097,79 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_column_writer_check_float16_min_max() {
+        let input = [
+            -f16::ONE,
+            f16::from_f32(3.0),
+            -f16::from_f32(2.0),
+            f16::from_f32(2.0),
+        ]
+        .into_iter()
+        .map(|s| ByteArray::from(s).into())
+        .collect::<Vec<_>>();
+
+        let stats = float16_statistics_roundtrip(&input);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        assert_eq!(stats.min(), &ByteArray::from(-f16::from_f32(2.0)));
+        assert_eq!(stats.max(), &ByteArray::from(f16::from_f32(3.0)));
+    }
+
+    #[test]
+    fn test_column_writer_check_float16_nan_middle() {
+        let input = [f16::ONE, f16::NAN, f16::ONE + f16::ONE]
+            .into_iter()
+            .map(|s| ByteArray::from(s).into())
+            .collect::<Vec<_>>();
+
+        let stats = float16_statistics_roundtrip(&input);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        assert_eq!(stats.min(), &ByteArray::from(f16::ONE));
+        assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE));
+    }
+
+    #[test]
+    fn test_float16_statistics_nan_middle() {
+        let input = [f16::ONE, f16::NAN, f16::ONE + f16::ONE]
+            .into_iter()
+            .map(|s| ByteArray::from(s).into())
+            .collect::<Vec<_>>();
+
+        let stats = float16_statistics_roundtrip(&input);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        assert_eq!(stats.min(), &ByteArray::from(f16::ONE));
+        assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE));
+    }
+
+    #[test]
+    fn test_float16_statistics_nan_start() {
+        let input = [f16::NAN, f16::ONE, f16::ONE + f16::ONE]
+            .into_iter()
+            .map(|s| ByteArray::from(s).into())
+            .collect::<Vec<_>>();
+
+        let stats = float16_statistics_roundtrip(&input);
+        assert!(stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+        assert_eq!(stats.min(), &ByteArray::from(f16::ONE));
+        assert_eq!(stats.max(), &ByteArray::from(f16::ONE + f16::ONE));
+    }
+
+    #[test]
+    fn test_float16_statistics_nan_only() {
+        let input = [f16::NAN, f16::NAN]
+            .into_iter()
+            .map(|s| ByteArray::from(s).into())
+            .collect::<Vec<_>>();
+
+        let stats = float16_statistics_roundtrip(&input);
+        assert!(!stats.has_min_max_set());
+        assert!(stats.is_min_max_backwards_compatible());
+    }

Review Comment:
   Ensuring NaN's are handled correctly in statistics (i.e. are ignored)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Parquet: read/write f16 for Arrow [arrow-rs]

Reply via email to