This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new d0260fcffa [Parquet] Add benchmark and test for writing NaNs to Parquet (#6955)
d0260fcffa is described below

commit d0260fcffa07a4cb8650cc290ab29027a3a8e65c
Author: Adam Reeve <[email protected]>
AuthorDate: Thu Jan 9 11:24:03 2025 +1300

    [Parquet] Add benchmark and test for writing NaNs to Parquet (#6955)
    
    * Add test and benchmarks for writing floats with NaNs
    
    * Remove extra benchmark with no NaNs
---
 arrow/Cargo.toml                      |  3 ++-
 arrow/src/util/bench_util.rs          | 46 +++++++++++++++++++++++++++++++++++
 parquet/benches/arrow_writer.rs       | 33 +++++++++++++++++++++++++
 parquet/src/arrow/arrow_writer/mod.rs | 39 +++++++++++++++++++++++++++++
 4 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index a1c9c0ab21..76119ec4ab 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -55,6 +55,7 @@ arrow-string = { workspace = true }
 
 rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
 pyo3 = { version = "0.23", default-features = false, optional = true }
+half = { version = "2.1", default-features = false, optional = true }
 
 [package.metadata.docs.rs]
 features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"]
@@ -70,7 +71,7 @@ prettyprint = ["arrow-cast/prettyprint"]
 # not the core arrow code itself. Be aware that `rand` must be kept as
 # an optional dependency for supporting compile to wasm32-unknown-unknown
 # target without assuming an environment containing JavaScript.
-test_utils = ["dep:rand"]
+test_utils = ["dep:rand", "dep:half"]
 pyarrow = ["pyo3", "ffi"]
 # force_validate runs full data validation for all arrays that are created
 # this is not enabled by default as it is too computationally expensive
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index 8eaae36dbe..53e0103412 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -21,6 +21,7 @@ use crate::array::*;
 use crate::datatypes::*;
 use crate::util::test_util::seedable_rng;
 use arrow_buffer::{Buffer, IntervalMonthDayNano};
+use half::f16;
 use rand::distributions::uniform::SampleUniform;
 use rand::thread_rng;
 use rand::Rng;
@@ -416,3 +417,48 @@ where
 
     DictionaryArray::from(data)
 }
+
+/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
+pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
+    let mut rng = seedable_rng();
+
+    (0..size)
+        .map(|_| {
+            if rng.gen::<f32>() < nan_density {
+                Some(f16::NAN)
+            } else {
+                Some(f16::from_f32(rng.gen()))
+            }
+        })
+        .collect()
+}
+
+/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
+pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
+    let mut rng = seedable_rng();
+
+    (0..size)
+        .map(|_| {
+            if rng.gen::<f32>() < nan_density {
+                Some(f32::NAN)
+            } else {
+                Some(rng.gen())
+            }
+        })
+        .collect()
+}
+
+/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
+pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
+    let mut rng = seedable_rng();
+
+    (0..size)
+        .map(|_| {
+            if rng.gen::<f32>() < nan_density {
+                Some(f64::NAN)
+            } else {
+                Some(rng.gen())
+            }
+        })
+        .collect()
+}
diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index bfa333db72..4166d962b5 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -28,7 +28,9 @@ extern crate parquet;
 use std::sync::Arc;
 
 use arrow::datatypes::*;
+use arrow::util::bench_util::{create_f16_array, create_f32_array, create_f64_array};
 use arrow::{record_batch::RecordBatch, util::data_gen::*};
+use arrow_array::RecordBatchOptions;
 use parquet::file::properties::WriterProperties;
 use parquet::{arrow::ArrowWriter, errors::Result};
 
@@ -181,6 +183,25 @@ fn create_bool_bench_batch_non_null(
     )?)
 }
 
+fn create_float_bench_batch_with_nans(size: usize, nan_density: f32) -> Result<RecordBatch> {
+    let fields = vec![
+        Field::new("_1", DataType::Float16, false),
+        Field::new("_2", DataType::Float32, false),
+        Field::new("_3", DataType::Float64, false),
+    ];
+    let schema = Schema::new(fields);
+    let columns: Vec<arrow_array::ArrayRef> = vec![
+        Arc::new(create_f16_array(size, nan_density)),
+        Arc::new(create_f32_array(size, nan_density)),
+        Arc::new(create_f64_array(size, nan_density)),
+    ];
+    Ok(RecordBatch::try_new_with_options(
+        Arc::new(schema),
+        columns,
+        &RecordBatchOptions::new().with_match_field_names(false),
+    )?)
+}
+
 fn create_list_primitive_bench_batch(
     size: usize,
     null_density: f32,
@@ -459,6 +480,18 @@ fn bench_primitive_writer(c: &mut Criterion) {
         b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap())
     });
 
+    let batch = create_float_bench_batch_with_nans(4096, 0.5).unwrap();
+    group.throughput(Throughput::Bytes(
+        batch
+            .columns()
+            .iter()
+            .map(|f| f.get_array_memory_size() as u64)
+            .sum(),
+    ));
+    group.bench_function("4096 values float with NaNs", |b| {
+        b.iter(|| write_batch(&batch).unwrap())
+    });
+
     group.finish();
 }
 
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 871b140768..41f15569fd 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1095,6 +1095,7 @@ mod tests {
     use arrow::{array::*, buffer::Buffer};
     use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer};
     use arrow_schema::Fields;
+    use half::f16;
 
     use crate::basic::Encoding;
     use crate::data_type::AsBytes;
@@ -1763,6 +1764,44 @@ mod tests {
         );
     }
 
+    #[test]
+    fn arrow_writer_float_nans() {
+        let f16_field = Field::new("a", DataType::Float16, false);
+        let f32_field = Field::new("b", DataType::Float32, false);
+        let f64_field = Field::new("c", DataType::Float64, false);
+        let schema = Schema::new(vec![f16_field, f32_field, f64_field]);
+
+        let f16_values = (0..MEDIUM_SIZE)
+            .map(|i| {
+                Some(if i % 2 == 0 {
+                    f16::NAN
+                } else {
+                    f16::from_f32(i as f32)
+                })
+            })
+            .collect::<Float16Array>();
+
+        let f32_values = (0..MEDIUM_SIZE)
+            .map(|i| Some(if i % 2 == 0 { f32::NAN } else { i as f32 }))
+            .collect::<Float32Array>();
+
+        let f64_values = (0..MEDIUM_SIZE)
+            .map(|i| Some(if i % 2 == 0 { f64::NAN } else { i as f64 }))
+            .collect::<Float64Array>();
+
+        let batch = RecordBatch::try_new(
+            Arc::new(schema),
+            vec![
+                Arc::new(f16_values),
+                Arc::new(f32_values),
+                Arc::new(f64_values),
+            ],
+        )
+        .unwrap();
+
+        roundtrip(batch, None);
+    }
+
     const SMALL_SIZE: usize = 7;
     const MEDIUM_SIZE: usize = 63;
 

Reply via email to