alamb commented on code in PR #7849:
URL: https://github.com/apache/arrow-rs/pull/7849#discussion_r2183606773


##########
parquet-variant/benches/variant_builder.rs:
##########
@@ -388,6 +388,113 @@ fn bench_object_list_partially_same_schema(c: &mut 
Criterion) {
     });
 }
 
+// Benchmark validation performance
+fn bench_validation_validated_vs_unvalidated(c: &mut Criterion) {

Review Comment:
   This is a cool idea -- not related to fuzz testing I don't think but a nice 
addition anyways



##########
parquet-variant/tests/variant_interop.rs:
##########
@@ -275,3 +278,309 @@ fn variant_object_builder() {
 }
 
 // TODO: Add tests for object_nested and array_nested
+
+//
+// Validation Fuzzing Tests
+//
+// 1. Generate valid variants using the builder
+// 2. Randomly corrupt bytes in the serialized data
+// 3. Test both validation pathways:
+//    - If validation succeeds -> verify infallible APIs don't panic
+//    - If validation fails -> verify fallible APIs handle errors gracefully
+//
+
+#[test]
+fn test_validation_fuzz_integration() {
+    let mut rng = StdRng::seed_from_u64(42);
+
+    for _ in 0..1000 {
+        // Generate a random valid variant
+        let (metadata, value) = generate_random_variant(&mut rng);
+
+        // Corrupt it
+        let (corrupted_metadata, corrupted_value) = corrupt_variant_data(&mut 
rng, metadata, value);
+
+        // Test the validation workflow
+        test_validation_workflow(&corrupted_metadata, &corrupted_value);
+    }
+}
+
+fn generate_random_variant(rng: &mut StdRng) -> (Vec<u8>, Vec<u8>) {
+    let mut builder = VariantBuilder::new();
+    generate_random_value(rng, &mut builder, 3); // Max depth of 3
+    builder.finish()
+}
+
+fn generate_random_value(rng: &mut StdRng, builder: &mut VariantBuilder, 
max_depth: u32) {
+    if max_depth == 0 {
+        // Force simple values at max depth
+        builder.append_value(rng.random::<i32>());
+        return;
+    }
+
+    match rng.random_range(0..15) {
+        0 => builder.append_value(()),
+        1 => builder.append_value(rng.random::<bool>()),
+        2 => builder.append_value(rng.random::<i8>()),
+        3 => builder.append_value(rng.random::<i16>()),
+        4 => builder.append_value(rng.random::<i32>()),
+        5 => builder.append_value(rng.random::<i64>()),
+        6 => builder.append_value(rng.random::<f32>()),
+        7 => builder.append_value(rng.random::<f64>()),
+        8 => {
+            let len = rng.random_range(0..50);
+            let s: String = (0..len).map(|_| rng.random::<char>()).collect();
+            builder.append_value(s.as_str());
+        }
+        9 => {
+            let len = rng.random_range(0..50);
+            let bytes: Vec<u8> = (0..len).map(|_| rng.random()).collect();
+            builder.append_value(bytes.as_slice());
+        }
+        10 => {
+            if let Ok(decimal) = VariantDecimal4::try_new(rng.random(), 
rng.random_range(0..10)) {
+                builder.append_value(decimal);
+            } else {
+                builder.append_value(0i32);
+            }
+        }
+        11 => {
+            if let Ok(decimal) = VariantDecimal8::try_new(rng.random(), 
rng.random_range(0..19)) {
+                builder.append_value(decimal);
+            } else {
+                builder.append_value(0i64);
+            }
+        }
+        12 => {
+            if let Ok(decimal) = VariantDecimal16::try_new(rng.random(), 
rng.random_range(0..39)) {
+                builder.append_value(decimal);
+            } else {
+                builder.append_value(0i64); // Use i64 instead of i128
+            }
+        }
+        13 => {
+            // Generate a list
+            let mut list_builder = builder.new_list();
+            let list_len = rng.random_range(0..10);
+
+            for _ in 0..list_len {
+                list_builder.append_value(rng.random::<i32>());
+            }
+            list_builder.finish();
+        }
+        14 => {
+            // Generate an object
+            let mut object_builder = builder.new_object();
+            let obj_size = rng.random_range(0..10);
+
+            for i in 0..obj_size {
+                let key = format!("field_{i}");
+                object_builder.insert(&key, rng.random::<i32>());
+            }
+            object_builder.finish().unwrap();
+        }
+        _ => unreachable!(),
+    }
+}
+
+fn corrupt_variant_data(
+    rng: &mut StdRng,
+    mut metadata: Vec<u8>,
+    mut value: Vec<u8>,
+) -> (Vec<u8>, Vec<u8>) {
+    // Randomly decide what to corrupt
+    let corrupt_metadata = rng.random_bool(0.3);
+    let corrupt_value = rng.random_bool(0.7);
+
+    if corrupt_metadata && !metadata.is_empty() {
+        let num_corruptions = rng.random_range(1..=(metadata.len().min(5)));

Review Comment:
   i wonder if we need to corrupt it with more than one corruption 🤔 
   
   If you put this many corruptions I think it means you'll basically correct 
the entire thing for small variants (which might be ok
   
   Maybe it should be `max(3)` instead to do up to three corruptions 🤔 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to