This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new b2583b0333 [Variant] Speedup validation (#7878)
b2583b0333 is described below
commit b2583b0333cee7aee1960d8af80bdf73bc4271ed
Author: Matthew Kim <[email protected]>
AuthorDate: Fri Jul 11 07:12:30 2025 -0400
[Variant] Speedup validation (#7878)
# Rationale for this change
- Closes https://github.com/apache/arrow-rs/issues/7869
- Closes https://github.com/apache/arrow-rs/issues/7872
This PR contains algorithmic modifications to the validation logic and
the associated benchmarks, specifically targeting complex object and
list validation.
Previously, the approach involved iterating over each element and
repeatedly fetching the same slice of the backing buffer, then slicing
_into_ that buffer again for each individual element. This led to
redundant buffer access.
The new validation approach instead works in multiple passes that take
advantage of the variant's memory layout. For example, dictionary field names
are stored contiguously; instead of checking whether each field name is
UTF-8-encoded separately, we now validate the entire field name buffer in
a single pass.
The benchmark cases were adapted from
`test_json_to_variant_object_very_large`,
`test_json_to_variant_object_complex`, and
`test_json_to_variant_array_nested_large` test cases.
Compared to #7871, we observe a significant improvement in performance:
<img width="576" alt="Screenshot 2025-07-07 at 10 25 07 AM"
src="https://github.com/user-attachments/assets/b8644466-8259-4081-892b-c18f9f64b9f3"
/>
@scovich @alamb
---
parquet-variant-json/Cargo.toml | 1 -
parquet-variant/Cargo.toml | 4 +
parquet-variant/benches/variant_validation.rs | 138 ++++++++++++++++++++++++++
parquet-variant/src/decoder.rs | 19 ++++
parquet-variant/src/utils.rs | 8 --
parquet-variant/src/variant/list.rs | 35 ++++++-
parquet-variant/src/variant/metadata.rs | 87 ++++++++++++++--
parquet-variant/src/variant/object.rs | 80 ++++++++++++++-
8 files changed, 345 insertions(+), 27 deletions(-)
diff --git a/parquet-variant-json/Cargo.toml b/parquet-variant-json/Cargo.toml
index 830a3c0600..86281e4ae9 100644
--- a/parquet-variant-json/Cargo.toml
+++ b/parquet-variant-json/Cargo.toml
@@ -46,4 +46,3 @@ name = "parquet_variant_json"
bench = false
[dev-dependencies]
-
diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml
index 3edfbb76ed..329399f9f6 100644
--- a/parquet-variant/Cargo.toml
+++ b/parquet-variant/Cargo.toml
@@ -55,3 +55,7 @@ rand = { version = "0.9", default-features = false, features
= [
[[bench]]
name = "variant_builder"
harness = false
+
+[[bench]]
+name = "variant_validation"
+harness = false
diff --git a/parquet-variant/benches/variant_validation.rs
b/parquet-variant/benches/variant_validation.rs
new file mode 100644
index 0000000000..0ccc101178
--- /dev/null
+++ b/parquet-variant/benches/variant_validation.rs
@@ -0,0 +1,138 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate parquet_variant;
+
+use criterion::*;
+
+use parquet_variant::{Variant, VariantBuilder};
+
+fn generate_large_object() -> (Vec<u8>, Vec<u8>) {
+    // 126 elements (keys: 000-125) - each element is an object of 126
+    // elements (keys: 125-250) - each element a list of numbers from 0-127
+ let mut variant_builder = VariantBuilder::new();
+ let mut outer_object = variant_builder.new_object();
+
+ for i in 0..=125 {
+ let key = format!("{i:03}");
+ let mut inner_object = outer_object.new_object(&key);
+
+ for j in 125..=250 {
+ let inner_key = format!("{j}");
+ let mut list_builder = inner_object.new_list(&inner_key);
+
+ for k in 0..=127 {
+ list_builder.append_value(Variant::Int8(k));
+ }
+ list_builder.finish();
+ }
+ inner_object.finish().unwrap();
+ }
+ outer_object.finish().unwrap();
+
+ variant_builder.finish()
+}
+
+fn generate_complex_object() -> (Vec<u8>, Vec<u8>) {
+ let mut variant_builder = VariantBuilder::new();
+ let mut object_builder = variant_builder.new_object();
+ let mut inner_list_builder = object_builder.new_list("booleans");
+
+ for _ in 0..1024 {
+ inner_list_builder.append_value(Variant::BooleanTrue);
+ }
+
+ inner_list_builder.finish();
+ object_builder.insert("null", Variant::Null);
+ let mut inner_list_builder = object_builder.new_list("numbers");
+ for _ in 0..1024 {
+ inner_list_builder.append_value(Variant::Int8(4));
+ inner_list_builder.append_value(Variant::Double(-3e0));
+ inner_list_builder.append_value(Variant::Double(1001e-3));
+ }
+ inner_list_builder.finish();
+
+ let mut inner_object_builder = object_builder.new_object("nested");
+
+ for i in 0..2048 {
+ let key = format!("{}", 1024 - i);
+ inner_object_builder.insert(&key, i);
+ }
+ inner_object_builder.finish().unwrap();
+
+ object_builder.finish().unwrap();
+
+ variant_builder.finish()
+}
+
+fn generate_large_nested_list() -> (Vec<u8>, Vec<u8>) {
+ let mut variant_builder = VariantBuilder::new();
+ let mut list_builder = variant_builder.new_list();
+ for _ in 0..255 {
+ let mut list_builder_inner = list_builder.new_list();
+ for _ in 0..120 {
+ list_builder_inner.append_value(Variant::Null);
+
+ let mut list_builder_inner_inner = list_builder_inner.new_list();
+ for _ in 0..20 {
+ list_builder_inner_inner.append_value(Variant::Double(-3e0));
+ }
+
+ list_builder_inner_inner.finish();
+ }
+ list_builder_inner.finish();
+ }
+ list_builder.finish();
+ variant_builder.finish()
+}
+
+// Generates a large object and performs full validation
+fn bench_validate_large_object(c: &mut Criterion) {
+ let (metadata, value) = generate_large_object();
+ c.bench_function("bench_validate_large_object", |b| {
+ b.iter(|| {
+ std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
+ })
+ });
+}
+
+fn bench_validate_complex_object(c: &mut Criterion) {
+ let (metadata, value) = generate_complex_object();
+ c.bench_function("bench_validate_complex_object", |b| {
+ b.iter(|| {
+ std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
+ })
+ });
+}
+
+fn bench_validate_large_nested_list(c: &mut Criterion) {
+ let (metadata, value) = generate_large_nested_list();
+ c.bench_function("bench_validate_large_nested_list", |b| {
+ b.iter(|| {
+ std::hint::black_box(Variant::try_new(&metadata, &value).unwrap());
+ })
+ });
+}
+
+criterion_group!(
+ benches,
+ bench_validate_large_object,
+ bench_validate_complex_object,
+ bench_validate_large_nested_list
+);
+
+criterion_main!(benches);
diff --git a/parquet-variant/src/decoder.rs b/parquet-variant/src/decoder.rs
index e419eca6ee..5a6aab43ff 100644
--- a/parquet-variant/src/decoder.rs
+++ b/parquet-variant/src/decoder.rs
@@ -200,6 +200,25 @@ impl OffsetSizeBytes {
}
}
+/// Converts a byte buffer to offset values based on the specific offset size
+pub(crate) fn map_bytes_to_offsets(
+ buffer: &[u8],
+ offset_size: OffsetSizeBytes,
+) -> impl Iterator<Item = usize> + use<'_> {
+ buffer
+ .chunks_exact(offset_size as usize)
+ .map(move |chunk| match offset_size {
+ OffsetSizeBytes::One => chunk[0] as usize,
+ OffsetSizeBytes::Two => u16::from_le_bytes([chunk[0], chunk[1]])
as usize,
+ OffsetSizeBytes::Three => {
+ u32::from_le_bytes([chunk[0], chunk[1], chunk[2], 0]) as usize
+ }
+ OffsetSizeBytes::Four => {
+ u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])
as usize
+ }
+ })
+}
+
/// Extract the primitive type from a Variant value-metadata byte
pub(crate) fn get_primitive_type(metadata: u8) -> Result<VariantPrimitiveType,
ArrowError> {
// last 6 bits contain the primitive-type, see spec
diff --git a/parquet-variant/src/utils.rs b/parquet-variant/src/utils.rs
index 765ea04ae6..ef402064e9 100644
--- a/parquet-variant/src/utils.rs
+++ b/parquet-variant/src/utils.rs
@@ -122,11 +122,3 @@ where
Some(Err(start))
}
-
-/// Attempts to prove a fallible iterator is actually infallible in practice,
by consuming every
-/// element and returning the first error (if any).
-pub(crate) fn validate_fallible_iterator<T, E>(
- mut it: impl Iterator<Item = Result<T, E>>,
-) -> Result<(), E> {
- it.find(Result::is_err).transpose().map(|_| ())
-}
diff --git a/parquet-variant/src/variant/list.rs
b/parquet-variant/src/variant/list.rs
index 05ddf9b2b7..11122190b4 100644
--- a/parquet-variant/src/variant/list.rs
+++ b/parquet-variant/src/variant/list.rs
@@ -14,10 +14,9 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use crate::decoder::OffsetSizeBytes;
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
use crate::utils::{
first_byte_from_slice, overflow_error, slice_from_slice,
slice_from_slice_at_offset,
- validate_fallible_iterator,
};
use crate::variant::{Variant, VariantMetadata};
@@ -209,9 +208,35 @@ impl<'m, 'v> VariantList<'m, 'v> {
// by value to all the children (who would otherwise re-validate
it repeatedly).
self.metadata = self.metadata.with_full_validation()?;
- // Iterate over all string keys in this dictionary in order to
prove that the offset
- // array is valid, all offsets are in bounds, and all string bytes
are valid utf-8.
- validate_fallible_iterator(self.iter_try())?;
+ let offset_buffer = slice_from_slice(
+ self.value,
+ self.header.first_offset_byte()..self.first_value_byte,
+ )?;
+
+ let offsets =
+ map_bytes_to_offsets(offset_buffer,
self.header.offset_size).collect::<Vec<_>>();
+
+ // Validate offsets are in-bounds and monotonically increasing.
+ // Since shallow verification checks whether the first and last
offsets are in-bounds,
+ // we can also verify all offsets are in-bounds by checking if
offsets are monotonically increasing.
+ let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+ if !are_offsets_monotonic {
+ return Err(ArrowError::InvalidArgumentError(
+ "offsets are not monotonically increasing".to_string(),
+ ));
+ }
+
+ let value_buffer = slice_from_slice(self.value,
self.first_value_byte..)?;
+
+            // Validate whether the list elements are valid variant values
+ for i in 1..offsets.len() {
+ let start_offset = offsets[i - 1];
+ let end_offset = offsets[i];
+
+ let value_bytes = slice_from_slice(value_buffer,
start_offset..end_offset)?;
+ Variant::try_new_with_metadata(self.metadata, value_bytes)?;
+ }
+
self.validated = true;
}
Ok(self)
diff --git a/parquet-variant/src/variant/metadata.rs
b/parquet-variant/src/variant/metadata.rs
index 0aad22ea72..b50a766869 100644
--- a/parquet-variant/src/variant/metadata.rs
+++ b/parquet-variant/src/variant/metadata.rs
@@ -15,11 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-use crate::decoder::OffsetSizeBytes;
-use crate::utils::{
- first_byte_from_slice, overflow_error, slice_from_slice, string_from_slice,
- validate_fallible_iterator,
-};
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
+use crate::utils::{first_byte_from_slice, overflow_error, slice_from_slice,
string_from_slice};
use arrow_schema::ArrowError;
@@ -228,9 +225,47 @@ impl<'m> VariantMetadata<'m> {
/// [validation]: Self#Validation
pub fn with_full_validation(mut self) -> Result<Self, ArrowError> {
if !self.validated {
- // Iterate over all string keys in this dictionary in order to
prove that the offset
- // array is valid, all offsets are in bounds, and all string bytes
are valid utf-8.
- validate_fallible_iterator(self.iter_try())?;
+ let offset_bytes = slice_from_slice(
+ self.bytes,
+ self.header.first_offset_byte()..self.first_value_byte,
+ )?;
+
+ let offsets =
+ map_bytes_to_offsets(offset_bytes,
self.header.offset_size).collect::<Vec<_>>();
+
+ // Validate offsets are in-bounds and monotonically increasing.
+ // Since shallow validation ensures the first and last offsets are
in bounds, we can also verify all offsets
+ // are in-bounds by checking if offsets are monotonically
increasing.
+ let are_offsets_monotonic = offsets.is_sorted_by(|a, b| a < b);
+ if !are_offsets_monotonic {
+ return Err(ArrowError::InvalidArgumentError(
+ "offsets not monotonically increasing".to_string(),
+ ));
+ }
+
+ // Verify the string values in the dictionary are UTF-8 encoded
strings.
+ let value_buffer =
+ string_from_slice(self.bytes, 0,
self.first_value_byte..self.bytes.len())?;
+
+ if self.header.is_sorted {
+ // Validate the dictionary values are unique and
lexicographically sorted
+ let are_dictionary_values_unique_and_sorted =
(1..offsets.len())
+ .map(|i| {
+ let field_range = offsets[i - 1]..offsets[i];
+ value_buffer.get(field_range)
+ })
+ .is_sorted_by(|a, b| match (a, b) {
+ (Some(a), Some(b)) => a < b,
+ _ => false,
+ });
+
+ if !are_dictionary_values_unique_and_sorted {
+ return Err(ArrowError::InvalidArgumentError(
+ "dictionary values are not unique and
ordered".to_string(),
+ ));
+ }
+ }
+
self.validated = true;
}
Ok(self)
@@ -399,6 +434,42 @@ mod tests {
);
}
+ #[test]
+ fn try_new_fails_non_monotonic2() {
+ // this test case checks whether offsets are monotonic in the full
validation logic.
+
+ // 'cat', 'dog', 'lamb', "eel"
+ let bytes = &[
+ 0b0000_0001, // header, offset_size_minus_one=0 and version=1
+ 4, // dictionary_size
+ 0x00,
+ 0x02,
+ 0x01, // Doesn't increase monotonically
+ 0x10,
+ 13,
+ b'c',
+ b'a',
+ b't',
+ b'd',
+ b'o',
+ b'g',
+ b'l',
+ b'a',
+ b'm',
+ b'b',
+ b'e',
+ b'e',
+ b'l',
+ ];
+
+ let err = VariantMetadata::try_new(bytes).unwrap_err();
+
+ assert!(
+ matches!(err, ArrowError::InvalidArgumentError(_)),
+ "unexpected error: {err:?}"
+ );
+ }
+
#[test]
fn try_new_truncated_offsets_inline() {
// Missing final offset
diff --git a/parquet-variant/src/variant/object.rs
b/parquet-variant/src/variant/object.rs
index 5efca267af..ea0c6fac0f 100644
--- a/parquet-variant/src/variant/object.rs
+++ b/parquet-variant/src/variant/object.rs
@@ -14,10 +14,9 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use crate::decoder::OffsetSizeBytes;
+use crate::decoder::{map_bytes_to_offsets, OffsetSizeBytes};
use crate::utils::{
first_byte_from_slice, overflow_error, slice_from_slice,
try_binary_search_range_by,
- validate_fallible_iterator,
};
use crate::variant::{Variant, VariantMetadata};
@@ -210,9 +209,80 @@ impl<'m, 'v> VariantObject<'m, 'v> {
// by value to all the children (who would otherwise re-validate
it repeatedly).
self.metadata = self.metadata.with_full_validation()?;
- // Iterate over all string keys in this dictionary in order to
prove that the offset
- // array is valid, all offsets are in bounds, and all string bytes
are valid utf-8.
- validate_fallible_iterator(self.iter_try())?;
+ let field_id_buffer = slice_from_slice(
+ self.value,
+
self.header.field_ids_start_byte()..self.first_field_offset_byte,
+ )?;
+
+ let field_ids = map_bytes_to_offsets(field_id_buffer,
self.header.field_id_size)
+ .collect::<Vec<_>>();
+
+ // Validate all field ids exist in the metadata dictionary and the
corresponding field names are lexicographically sorted
+ if self.metadata.is_sorted() {
+ // Since the metadata dictionary has unique and sorted field
names, we can also guarantee this object's field names
+ // are lexicographically sorted by their field id ordering
+ if !field_ids.is_sorted() {
+ return Err(ArrowError::InvalidArgumentError(
+ "field names not sorted".to_string(),
+ ));
+ }
+
+ // Since field ids are sorted, if the last field is smaller
than the dictionary size,
+ // we also know all field ids are smaller than the dictionary
size and in-bounds.
+ if let Some(&last_field_id) = field_ids.last() {
+ if last_field_id >= self.metadata.dictionary_size() {
+ return Err(ArrowError::InvalidArgumentError(
+ "field id is not valid".to_string(),
+ ));
+ }
+ }
+ } else {
+ // The metadata dictionary can't guarantee uniqueness or
sortedness, so we have to parse out the corresponding field names
+ // to check lexicographical order
+ let are_field_names_sorted = field_ids
+ .iter()
+ .map(|&i| self.metadata.get(i))
+ .collect::<Result<Vec<_>, _>>()?
+ .is_sorted();
+
+ if !are_field_names_sorted {
+ return Err(ArrowError::InvalidArgumentError(
+ "field names not sorted".to_string(),
+ ));
+ }
+
+ // Since field ids are not guaranteed to be sorted, scan over
all field ids
+ // and check that field ids are less than dictionary size
+
+ let are_field_ids_in_bounds = field_ids
+ .iter()
+ .all(|&id| id < self.metadata.dictionary_size());
+
+ if !are_field_ids_in_bounds {
+ return Err(ArrowError::InvalidArgumentError(
+ "field id is not valid".to_string(),
+ ));
+ }
+ }
+
+            // Validate whether the field values are valid variant values
+ let field_offset_buffer = slice_from_slice(
+ self.value,
+ self.first_field_offset_byte..self.first_value_byte,
+ )?;
+ let num_offsets = field_offset_buffer.len() /
self.header.field_offset_size();
+
+ let value_buffer = slice_from_slice(self.value,
self.first_value_byte..)?;
+
+ map_bytes_to_offsets(field_offset_buffer,
self.header.field_offset_size)
+ .take(num_offsets.saturating_sub(1))
+ .try_for_each(|offset| {
+ let value_bytes = slice_from_slice(value_buffer,
offset..)?;
+ Variant::try_new_with_metadata(self.metadata,
value_bytes)?;
+
+ Ok::<_, ArrowError>(())
+ })?;
+
self.validated = true;
}
Ok(self)