This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d0260fcffa [Parquet] Add benchmark and test for writing NaNs to Parquet (#6955)
d0260fcffa is described below
commit d0260fcffa07a4cb8650cc290ab29027a3a8e65c
Author: Adam Reeve <[email protected]>
AuthorDate: Thu Jan 9 11:24:03 2025 +1300
[Parquet] Add benchmark and test for writing NaNs to Parquet (#6955)
* Add test and benchmarks for writing floats with NaNs
* Remove extra benchmark with no NaNs
---
arrow/Cargo.toml | 3 ++-
arrow/src/util/bench_util.rs | 46 +++++++++++++++++++++++++++++++++++
parquet/benches/arrow_writer.rs | 33 +++++++++++++++++++++++++
parquet/src/arrow/arrow_writer/mod.rs | 39 +++++++++++++++++++++++++++++
4 files changed, 120 insertions(+), 1 deletion(-)
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
index a1c9c0ab21..76119ec4ab 100644
--- a/arrow/Cargo.toml
+++ b/arrow/Cargo.toml
@@ -55,6 +55,7 @@ arrow-string = { workspace = true }
rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true }
pyo3 = { version = "0.23", default-features = false, optional = true }
+half = { version = "2.1", default-features = false, optional = true }
[package.metadata.docs.rs]
features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"]
@@ -70,7 +71,7 @@ prettyprint = ["arrow-cast/prettyprint"]
# not the core arrow code itself. Be aware that `rand` must be kept as
# an optional dependency for supporting compile to wasm32-unknown-unknown
# target without assuming an environment containing JavaScript.
-test_utils = ["dep:rand"]
+test_utils = ["dep:rand", "dep:half"]
pyarrow = ["pyo3", "ffi"]
# force_validate runs full data validation for all arrays that are created
# this is not enabled by default as it is too computationally expensive
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index 8eaae36dbe..53e0103412 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -21,6 +21,7 @@ use crate::array::*;
use crate::datatypes::*;
use crate::util::test_util::seedable_rng;
use arrow_buffer::{Buffer, IntervalMonthDayNano};
+use half::f16;
use rand::distributions::uniform::SampleUniform;
use rand::thread_rng;
use rand::Rng;
@@ -416,3 +417,48 @@ where
DictionaryArray::from(data)
}
+
+/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density
+pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array {
+ let mut rng = seedable_rng();
+
+ (0..size)
+ .map(|_| {
+ if rng.gen::<f32>() < nan_density {
+ Some(f16::NAN)
+ } else {
+ Some(f16::from_f32(rng.gen()))
+ }
+ })
+ .collect()
+}
+
+/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density
+pub fn create_f32_array(size: usize, nan_density: f32) -> Float32Array {
+ let mut rng = seedable_rng();
+
+ (0..size)
+ .map(|_| {
+ if rng.gen::<f32>() < nan_density {
+ Some(f32::NAN)
+ } else {
+ Some(rng.gen())
+ }
+ })
+ .collect()
+}
+
+/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density
+pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array {
+ let mut rng = seedable_rng();
+
+ (0..size)
+ .map(|_| {
+ if rng.gen::<f32>() < nan_density {
+ Some(f64::NAN)
+ } else {
+ Some(rng.gen())
+ }
+ })
+ .collect()
+}
diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index bfa333db72..4166d962b5 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -28,7 +28,9 @@ extern crate parquet;
use std::sync::Arc;
use arrow::datatypes::*;
+use arrow::util::bench_util::{create_f16_array, create_f32_array, create_f64_array};
use arrow::{record_batch::RecordBatch, util::data_gen::*};
+use arrow_array::RecordBatchOptions;
use parquet::file::properties::WriterProperties;
use parquet::{arrow::ArrowWriter, errors::Result};
@@ -181,6 +183,25 @@ fn create_bool_bench_batch_non_null(
)?)
}
+fn create_float_bench_batch_with_nans(size: usize, nan_density: f32) -> Result<RecordBatch> {
+ let fields = vec![
+ Field::new("_1", DataType::Float16, false),
+ Field::new("_2", DataType::Float32, false),
+ Field::new("_3", DataType::Float64, false),
+ ];
+ let schema = Schema::new(fields);
+ let columns: Vec<arrow_array::ArrayRef> = vec![
+ Arc::new(create_f16_array(size, nan_density)),
+ Arc::new(create_f32_array(size, nan_density)),
+ Arc::new(create_f64_array(size, nan_density)),
+ ];
+ Ok(RecordBatch::try_new_with_options(
+ Arc::new(schema),
+ columns,
+ &RecordBatchOptions::new().with_match_field_names(false),
+ )?)
+}
+
fn create_list_primitive_bench_batch(
size: usize,
null_density: f32,
@@ -459,6 +480,18 @@ fn bench_primitive_writer(c: &mut Criterion) {
b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap())
});
+ let batch = create_float_bench_batch_with_nans(4096, 0.5).unwrap();
+ group.throughput(Throughput::Bytes(
+ batch
+ .columns()
+ .iter()
+ .map(|f| f.get_array_memory_size() as u64)
+ .sum(),
+ ));
+ group.bench_function("4096 values float with NaNs", |b| {
+ b.iter(|| write_batch(&batch).unwrap())
+ });
+
group.finish();
}
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 871b140768..41f15569fd 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1095,6 +1095,7 @@ mod tests {
use arrow::{array::*, buffer::Buffer};
use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer};
use arrow_schema::Fields;
+ use half::f16;
use crate::basic::Encoding;
use crate::data_type::AsBytes;
@@ -1763,6 +1764,44 @@ mod tests {
);
}
+ #[test]
+ fn arrow_writer_float_nans() {
+ let f16_field = Field::new("a", DataType::Float16, false);
+ let f32_field = Field::new("b", DataType::Float32, false);
+ let f64_field = Field::new("c", DataType::Float64, false);
+ let schema = Schema::new(vec![f16_field, f32_field, f64_field]);
+
+ let f16_values = (0..MEDIUM_SIZE)
+ .map(|i| {
+ Some(if i % 2 == 0 {
+ f16::NAN
+ } else {
+ f16::from_f32(i as f32)
+ })
+ })
+ .collect::<Float16Array>();
+
+ let f32_values = (0..MEDIUM_SIZE)
+ .map(|i| Some(if i % 2 == 0 { f32::NAN } else { i as f32 }))
+ .collect::<Float32Array>();
+
+ let f64_values = (0..MEDIUM_SIZE)
+ .map(|i| Some(if i % 2 == 0 { f64::NAN } else { i as f64 }))
+ .collect::<Float64Array>();
+
+ let batch = RecordBatch::try_new(
+ Arc::new(schema),
+ vec![
+ Arc::new(f16_values),
+ Arc::new(f32_values),
+ Arc::new(f64_values),
+ ],
+ )
+ .unwrap();
+
+ roundtrip(batch, None);
+ }
+
const SMALL_SIZE: usize = 7;
const MEDIUM_SIZE: usize = 63;