This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new f2344d25ca feat: Add `Time`/`Interval`/`Decimal`/`Utf8View` in aggregate fuzz testing (#13226)
f2344d25ca is described below

commit f2344d25caa897988c519fdd19362888742d37b4
Author: Leslie Su <[email protected]>
AuthorDate: Wed Nov 6 02:22:20 2024 +0800

    feat: Add `Time`/`Interval`/`Decimal`/`Utf8View` in aggregate fuzz testing (#13226)
    
    * support Time/Interval/Decimal types in data generator.
    
    * introduce RandomNativeData trait.
    
    * fix bug.
    
    * support utf8view type in data generator.
    
    * fix clippy.
    
    * fix bug.
---
 datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs |  47 +++++-
 .../aggregation_fuzzer/data_generator.rs           | 173 ++++++++++++++++++---
 .../src/min_max/min_max_bytes.rs                   |   4 +
 test-utils/src/array_gen/decimal.rs                |  79 ++++++++++
 test-utils/src/array_gen/mod.rs                    |   3 +
 test-utils/src/array_gen/primitive.rs              |  90 +++--------
 test-utils/src/array_gen/random_data.rs            | 102 ++++++++++++
 test-utils/src/array_gen/string.rs                 |  28 +++-
 8 files changed, 433 insertions(+), 93 deletions(-)

diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
index 4cb2b1bfbc..16f539b759 100644
--- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
@@ -23,6 +23,10 @@ use arrow::datatypes::DataType;
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::pretty_format_batches;
 use arrow_array::types::Int64Type;
+use arrow_schema::{
+    IntervalUnit, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE,
+    DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE,
+};
 use datafusion::common::Result;
 use datafusion::datasource::MemTable;
 use datafusion::physical_expr::aggregate::AggregateExprBuilder;
@@ -45,7 +49,7 @@ use crate::fuzz_cases::aggregation_fuzzer::{
 use datafusion_common::HashMap;
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
 use rand::rngs::StdRng;
-use rand::{Rng, SeedableRng};
+use rand::{thread_rng, Rng, SeedableRng};
 use tokio::task::JoinSet;
 
 // ========================================================================
@@ -151,6 +155,7 @@ async fn test_count() {
 /// 1. Floating point numbers
 /// 1. structured types
 fn baseline_config() -> DatasetGeneratorConfig {
+    let mut rng = thread_rng();
     let columns = vec![
         ColumnDescr::new("i8", DataType::Int8),
         ColumnDescr::new("i16", DataType::Int16),
@@ -162,13 +167,45 @@ fn baseline_config() -> DatasetGeneratorConfig {
         ColumnDescr::new("u64", DataType::UInt64),
         ColumnDescr::new("date32", DataType::Date32),
         ColumnDescr::new("date64", DataType::Date64),
-        // TODO: date/time columns
-        // todo decimal columns
+        ColumnDescr::new("time32_s", DataType::Time32(TimeUnit::Second)),
+        ColumnDescr::new("time32_ms", DataType::Time32(TimeUnit::Millisecond)),
+        ColumnDescr::new("time64_us", DataType::Time64(TimeUnit::Microsecond)),
+        ColumnDescr::new("time64_ns", DataType::Time64(TimeUnit::Nanosecond)),
+        ColumnDescr::new(
+            "interval_year_month",
+            DataType::Interval(IntervalUnit::YearMonth),
+        ),
+        ColumnDescr::new(
+            "interval_day_time",
+            DataType::Interval(IntervalUnit::DayTime),
+        ),
+        ColumnDescr::new(
+            "interval_month_day_nano",
+            DataType::Interval(IntervalUnit::MonthDayNano),
+        ),
+        // begin decimal columns
+        ColumnDescr::new("decimal128", {
+            // Generate valid precision and scale for Decimal128 randomly.
+            let precision: u8 = rng.gen_range(1..=DECIMAL128_MAX_PRECISION);
+            // It's safe to cast `precision` to i8 type directly.
+            let scale: i8 = rng.gen_range(
+                i8::MIN..=std::cmp::min(precision as i8, DECIMAL128_MAX_SCALE),
+            );
+            DataType::Decimal128(precision, scale)
+        }),
+        ColumnDescr::new("decimal256", {
+            // Generate valid precision and scale for Decimal256 randomly.
+            let precision: u8 = rng.gen_range(1..=DECIMAL256_MAX_PRECISION);
+            // It's safe to cast `precision` to i8 type directly.
+            let scale: i8 = rng.gen_range(
+                i8::MIN..=std::cmp::min(precision as i8, DECIMAL256_MAX_SCALE),
+            );
+            DataType::Decimal256(precision, scale)
+        }),
         // begin string columns
         ColumnDescr::new("utf8", DataType::Utf8),
         ColumnDescr::new("largeutf8", DataType::LargeUtf8),
-        // TODO add support for utf8view in data generator
-        // ColumnDescr::new("utf8view", DataType::Utf8View),
+        ColumnDescr::new("utf8view", DataType::Utf8View),
         // todo binary
         // low cardinality columns
         ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
index aafa5ed7f6..88133a134e 100644
--- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
+++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs
@@ -18,11 +18,14 @@
 use std::sync::Arc;
 
 use arrow::datatypes::{
-    Date32Type, Date64Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type,
-    Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
+    ByteArrayType, ByteViewType, Date32Type, Date64Type, Decimal128Type, Decimal256Type,
+    Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type,
+    IntervalDayTimeType, IntervalMonthDayNanoType, IntervalYearMonthType, LargeUtf8Type,
+    StringViewType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
+    Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, Utf8Type,
 };
 use arrow_array::{ArrayRef, RecordBatch};
-use arrow_schema::{DataType, Field, Schema};
+use arrow_schema::{DataType, Field, IntervalUnit, Schema, TimeUnit};
 use datafusion_common::{arrow_datafusion_err, DataFusionError, Result};
 use datafusion_physical_expr::{expressions::col, PhysicalSortExpr};
 use datafusion_physical_expr_common::sort_expr::LexOrdering;
@@ -32,7 +35,7 @@ use rand::{
     thread_rng, Rng, SeedableRng,
 };
 use test_utils::{
-    array_gen::{PrimitiveArrayGenerator, StringArrayGenerator},
+    array_gen::{DecimalArrayGenerator, PrimitiveArrayGenerator, StringArrayGenerator},
     stagger_batch,
 };
 
@@ -219,7 +222,7 @@ struct RecordBatchGenerator {
 }
 
 macro_rules! generate_string_array {
-    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
+    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE: ident) => {{
         let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
         let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
         let max_len = $BATCH_GEN_RNG.gen_range(1..50);
@@ -232,25 +235,47 @@ macro_rules! generate_string_array {
             rng: $ARRAY_GEN_RNG,
         };
 
-        generator.gen_data::<$OFFSET_TYPE>()
+        match $ARROW_TYPE::DATA_TYPE {
+            DataType::Utf8 => generator.gen_data::<i32>(),
+            DataType::LargeUtf8 => generator.gen_data::<i64>(),
+            DataType::Utf8View => generator.gen_string_view(),
+            _ => unreachable!(),
+        }
+    }};
+}
+
+macro_rules! generate_decimal_array {
+    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT: expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $PRECISION: ident, $SCALE: ident, $ARROW_TYPE: ident) => {{
+        let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
+        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
+
+        let mut generator = DecimalArrayGenerator {
+            precision: $PRECISION,
+            scale: $SCALE,
+            num_decimals: $NUM_ROWS,
+            num_distinct_decimals: $MAX_NUM_DISTINCT,
+            null_pct,
+            rng: $ARRAY_GEN_RNG,
+        };
+
+        generator.gen_data::<$ARROW_TYPE>()
     }};
 }
 
 macro_rules! generate_primitive_array {
-    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {
-        paste::paste! {{
-            let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
-            let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
-
-            let mut generator = PrimitiveArrayGenerator {
-                num_primitives: $NUM_ROWS,
-                num_distinct_primitives: $MAX_NUM_DISTINCT,
-                null_pct,
-                rng: $ARRAY_GEN_RNG,
-            };
-
-            generator.gen_data::<$ARROW_TYPE>()
-    }}}
+    ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {{
+        let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
+        let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
+
+        let mut generator = PrimitiveArrayGenerator {
+            num_primitives: $NUM_ROWS,
+            num_distinct_primitives: $MAX_NUM_DISTINCT,
+            null_pct,
+            rng: $ARRAY_GEN_RNG,
+        };
+
+        generator.gen_data::<$ARROW_TYPE>()
+    }};
 }
 
 impl RecordBatchGenerator {
@@ -432,6 +457,100 @@ impl RecordBatchGenerator {
                     Date64Type
                 )
             }
+            DataType::Time32(TimeUnit::Second) => {
+                generate_primitive_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    Time32SecondType
+                )
+            }
+            DataType::Time32(TimeUnit::Millisecond) => {
+                generate_primitive_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    Time32MillisecondType
+                )
+            }
+            DataType::Time64(TimeUnit::Microsecond) => {
+                generate_primitive_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    Time64MicrosecondType
+                )
+            }
+            DataType::Time64(TimeUnit::Nanosecond) => {
+                generate_primitive_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    Time64NanosecondType
+                )
+            }
+            DataType::Interval(IntervalUnit::YearMonth) => {
+                generate_primitive_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    IntervalYearMonthType
+                )
+            }
+            DataType::Interval(IntervalUnit::DayTime) => {
+                generate_primitive_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    IntervalDayTimeType
+                )
+            }
+            DataType::Interval(IntervalUnit::MonthDayNano) => {
+                generate_primitive_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    IntervalMonthDayNanoType
+                )
+            }
+            DataType::Decimal128(precision, scale) => {
+                generate_decimal_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    precision,
+                    scale,
+                    Decimal128Type
+                )
+            }
+            DataType::Decimal256(precision, scale) => {
+                generate_decimal_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    precision,
+                    scale,
+                    Decimal256Type
+                )
+            }
             DataType::Utf8 => {
                 generate_string_array!(
                     self,
@@ -439,7 +558,7 @@ impl RecordBatchGenerator {
                     max_num_distinct,
                     batch_gen_rng,
                     array_gen_rng,
-                    i32
+                    Utf8Type
                 )
             }
             DataType::LargeUtf8 => {
@@ -449,7 +568,17 @@ impl RecordBatchGenerator {
                     max_num_distinct,
                     batch_gen_rng,
                     array_gen_rng,
-                    i64
+                    LargeUtf8Type
+                )
+            }
+            DataType::Utf8View => {
+                generate_string_array!(
+                    self,
+                    num_rows,
+                    max_num_distinct,
+                    batch_gen_rng,
+                    array_gen_rng,
+                    StringViewType
                 )
             }
             _ => {
diff --git a/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs b/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
index 501454edf7..a09d616ec8 100644
--- a/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
+++ b/datafusion/functions-aggregate/src/min_max/min_max_bytes.rs
@@ -338,6 +338,10 @@ impl GroupsAccumulator for MinMaxBytesAccumulator {
 /// This is a heuristic to avoid allocating too many small buffers
 fn capacity_to_view_block_size(data_capacity: usize) -> u32 {
     let max_block_size = 2 * 1024 * 1024;
+    // Avoid block size equal to zero when calling `with_fixed_block_size()`.
+    if data_capacity == 0 {
+        return 1;
+    }
     if let Ok(block_size) = u32::try_from(data_capacity) {
         block_size.min(max_block_size)
     } else {
diff --git a/test-utils/src/array_gen/decimal.rs b/test-utils/src/array_gen/decimal.rs
new file mode 100644
index 0000000000..f878a830c4
--- /dev/null
+++ b/test-utils/src/array_gen/decimal.rs
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, PrimitiveArray, PrimitiveBuilder, UInt32Array};
+use arrow::datatypes::DecimalType;
+use rand::rngs::StdRng;
+use rand::Rng;
+
+use super::random_data::RandomNativeData;
+
+/// Randomly generate decimal arrays
+pub struct DecimalArrayGenerator {
+    /// The precision of the decimal type
+    pub precision: u8,
+    /// The scale of the decimal type
+    pub scale: i8,
+    /// The total number of decimals in the output
+    pub num_decimals: usize,
+    /// The number of distinct decimals in the columns
+    pub num_distinct_decimals: usize,
+    /// The percentage of nulls in the columns
+    pub null_pct: f64,
+    /// Random number generator
+    pub rng: StdRng,
+}
+
+impl DecimalArrayGenerator {
+    /// Create a Decimal128Array / Decimal256Array with random values.
+    pub fn gen_data<D>(&mut self) -> ArrayRef
+    where
+        D: DecimalType + RandomNativeData,
+    {
+        // table of decimals from which to draw
+        let distinct_decimals: PrimitiveArray<D> = {
+            let mut decimal_builder =
+                PrimitiveBuilder::<D>::with_capacity(self.num_distinct_decimals);
+            for _ in 0..self.num_distinct_decimals {
+                decimal_builder
+                    .append_option(Some(D::generate_random_native_data(&mut self.rng)));
+            }
+
+            decimal_builder
+                .finish()
+                .with_precision_and_scale(self.precision, self.scale)
+                .unwrap()
+        };
+
+        // pick num_decimals randomly from the distinct decimal table
+        let indicies: UInt32Array = (0..self.num_decimals)
+            .map(|_| {
+                if self.rng.gen::<f64>() < self.null_pct {
+                    None
+                } else if self.num_distinct_decimals > 1 {
+                    let range = 1..(self.num_distinct_decimals as u32);
+                    Some(self.rng.gen_range(range))
+                } else {
+                    Some(0)
+                }
+            })
+            .collect();
+
+        let options = None;
+        arrow::compute::take(&distinct_decimals, &indicies, options).unwrap()
+    }
+}
diff --git a/test-utils/src/array_gen/mod.rs b/test-utils/src/array_gen/mod.rs
index 4a799ae737..8e0e39ddfd 100644
--- a/test-utils/src/array_gen/mod.rs
+++ b/test-utils/src/array_gen/mod.rs
@@ -15,8 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+mod decimal;
 mod primitive;
+mod random_data;
 mod string;
 
+pub use decimal::DecimalArrayGenerator;
 pub use primitive::PrimitiveArrayGenerator;
 pub use string::StringArrayGenerator;
diff --git a/test-utils/src/array_gen/primitive.rs b/test-utils/src/array_gen/primitive.rs
index 0581862d63..2469cbf446 100644
--- a/test-utils/src/array_gen/primitive.rs
+++ b/test-utils/src/array_gen/primitive.rs
@@ -17,42 +17,10 @@
 
 use arrow::array::{ArrayRef, ArrowPrimitiveType, PrimitiveArray, UInt32Array};
 use arrow::datatypes::DataType;
-use rand::distributions::Standard;
-use rand::prelude::Distribution;
 use rand::rngs::StdRng;
 use rand::Rng;
 
-/// Trait for converting type safely from a native type T impl this trait.
-pub trait FromNative: std::fmt::Debug + Send + Sync + Copy + Default {
-    /// Convert native type from i64.
-    fn from_i64(_: i64) -> Option<Self> {
-        None
-    }
-}
-
-macro_rules! native_type {
-    ($t: ty $(, $from:ident)*) => {
-        impl FromNative for $t {
-            $(
-                #[inline]
-                fn $from(v: $t) -> Option<Self> {
-                    Some(v)
-                }
-            )*
-        }
-    };
-}
-
-native_type!(i8);
-native_type!(i16);
-native_type!(i32);
-native_type!(i64, from_i64);
-native_type!(u8);
-native_type!(u16);
-native_type!(u32);
-native_type!(u64);
-native_type!(f32);
-native_type!(f64);
+use super::random_data::RandomNativeData;
 
 /// Randomly generate primitive array
 pub struct PrimitiveArrayGenerator {
@@ -70,41 +38,33 @@ pub struct PrimitiveArrayGenerator {
 impl PrimitiveArrayGenerator {
     pub fn gen_data<A>(&mut self) -> ArrayRef
     where
-        A: ArrowPrimitiveType,
-        A::Native: FromNative,
-        Standard: Distribution<<A as ArrowPrimitiveType>::Native>,
+        A: ArrowPrimitiveType + RandomNativeData,
     {
         // table of primitives from which to draw
-        let distinct_primitives: PrimitiveArray<A> = (0..self.num_distinct_primitives)
-            .map(|_| {
-                Some(match A::DATA_TYPE {
-                    DataType::Int8
-                    | DataType::Int16
-                    | DataType::Int32
-                    | DataType::Int64
-                    | DataType::UInt8
-                    | DataType::UInt16
-                    | DataType::UInt32
-                    | DataType::UInt64
-                    | DataType::Float32
-                    | DataType::Float64
-                    | DataType::Date32 => self.rng.gen::<A::Native>(),
-
-                    DataType::Date64 => {
-                        // TODO: constrain this range to valid dates if necessary
-                        let date_value = self.rng.gen_range(i64::MIN..=i64::MAX);
-                        let millis_per_day = 86_400_000;
-                        let adjusted_value = date_value - (date_value % millis_per_day);
-                        A::Native::from_i64(adjusted_value).unwrap()
-                    }
+        let distinct_primitives: PrimitiveArray<A> = match A::DATA_TYPE {
+            DataType::Int8
+            | DataType::Int16
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::UInt8
+            | DataType::UInt16
+            | DataType::UInt32
+            | DataType::UInt64
+            | DataType::Float32
+            | DataType::Float64
+            | DataType::Date32
+            | DataType::Date64
+            | DataType::Time32(_)
+            | DataType::Time64(_)
+            | DataType::Interval(_) => (0..self.num_distinct_primitives)
+                .map(|_| Some(A::generate_random_native_data(&mut self.rng)))
+                .collect(),
 
-                    _ => {
-                        let arrow_type = A::DATA_TYPE;
-                        panic!("Unsupported arrow data type: {arrow_type}")
-                    }
-                })
-            })
-            .collect();
+            _ => {
+                let arrow_type = A::DATA_TYPE;
+                panic!("Unsupported arrow data type: {arrow_type}")
+            }
+        };
 
         // pick num_primitves randomly from the distinct string table
         let indicies: UInt32Array = (0..self.num_primitives)
diff --git a/test-utils/src/array_gen/random_data.rs b/test-utils/src/array_gen/random_data.rs
new file mode 100644
index 0000000000..23227100d7
--- /dev/null
+++ b/test-utils/src/array_gen/random_data.rs
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::ArrowPrimitiveType;
+use arrow::datatypes::{
+    i256, Date32Type, Date64Type, Decimal128Type, Decimal256Type, Float32Type,
+    Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTime,
+    IntervalDayTimeType, IntervalMonthDayNano, IntervalMonthDayNanoType,
+    IntervalYearMonthType, Time32MillisecondType, Time32SecondType,
+    Time64MicrosecondType, Time64NanosecondType, UInt16Type, UInt32Type, UInt64Type,
+    UInt8Type,
+};
+use rand::distributions::Standard;
+use rand::prelude::Distribution;
+use rand::rngs::StdRng;
+use rand::Rng;
+
+/// Generate corresponding NativeType value randomly according to
+/// ArrowPrimitiveType.
+pub trait RandomNativeData: ArrowPrimitiveType {
+    fn generate_random_native_data(rng: &mut StdRng) -> Self::Native;
+}
+
+macro_rules! basic_random_data {
+    ($ARROW_TYPE: ty) => {
+        impl RandomNativeData for $ARROW_TYPE
+        where
+            Standard: Distribution<Self::Native>,
+        {
+            #[inline]
+            fn generate_random_native_data(rng: &mut StdRng) -> Self::Native {
+                rng.gen::<Self::Native>()
+            }
+        }
+    };
+}
+
+basic_random_data!(Int8Type);
+basic_random_data!(Int16Type);
+basic_random_data!(Int32Type);
+basic_random_data!(Int64Type);
+basic_random_data!(UInt8Type);
+basic_random_data!(UInt16Type);
+basic_random_data!(UInt32Type);
+basic_random_data!(UInt64Type);
+basic_random_data!(Float32Type);
+basic_random_data!(Float64Type);
+basic_random_data!(Date32Type);
+basic_random_data!(Time32SecondType);
+basic_random_data!(Time32MillisecondType);
+basic_random_data!(Time64MicrosecondType);
+basic_random_data!(Time64NanosecondType);
+basic_random_data!(IntervalYearMonthType);
+basic_random_data!(Decimal128Type);
+
+impl RandomNativeData for Date64Type {
+    fn generate_random_native_data(rng: &mut StdRng) -> Self::Native {
+        // TODO: constrain this range to valid dates if necessary
+        let date_value = rng.gen_range(i64::MIN..=i64::MAX);
+        let millis_per_day = 86_400_000;
+        date_value - (date_value % millis_per_day)
+    }
+}
+
+impl RandomNativeData for IntervalDayTimeType {
+    fn generate_random_native_data(rng: &mut StdRng) -> Self::Native {
+        IntervalDayTime {
+            days: rng.gen::<i32>(),
+            milliseconds: rng.gen::<i32>(),
+        }
+    }
+}
+
+impl RandomNativeData for IntervalMonthDayNanoType {
+    fn generate_random_native_data(rng: &mut StdRng) -> Self::Native {
+        IntervalMonthDayNano {
+            months: rng.gen::<i32>(),
+            days: rng.gen::<i32>(),
+            nanoseconds: rng.gen::<i64>(),
+        }
+    }
+}
+
+impl RandomNativeData for Decimal256Type {
+    fn generate_random_native_data(rng: &mut StdRng) -> Self::Native {
+        i256::from_parts(rng.gen::<u128>(), rng.gen::<i128>())
+    }
+}
diff --git a/test-utils/src/array_gen/string.rs b/test-utils/src/array_gen/string.rs
index fbfa2bb941..b5cef6321b 100644
--- a/test-utils/src/array_gen/string.rs
+++ b/test-utils/src/array_gen/string.rs
@@ -15,7 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, UInt32Array};
+use arrow::array::{
+    ArrayRef, GenericStringArray, OffsetSizeTrait, StringViewArray, UInt32Array,
+};
 use rand::rngs::StdRng;
 use rand::Rng;
 
@@ -59,6 +61,30 @@ impl StringArrayGenerator {
         let options = None;
         arrow::compute::take(&distinct_strings, &indicies, options).unwrap()
     }
+
+    /// Creates a StringViewArray with random strings.
+    pub fn gen_string_view(&mut self) -> ArrayRef {
+        let distinct_string_views: StringViewArray = (0..self.num_distinct_strings)
+            .map(|_| Some(random_string(&mut self.rng, self.max_len)))
+            .collect();
+
+        // pick num_strings randomly from the distinct string table
+        let indicies: UInt32Array = (0..self.num_strings)
+            .map(|_| {
+                if self.rng.gen::<f64>() < self.null_pct {
+                    None
+                } else if self.num_distinct_strings > 1 {
+                    let range = 1..(self.num_distinct_strings as u32);
+                    Some(self.rng.gen_range(range))
+                } else {
+                    Some(0)
+                }
+            })
+            .collect();
+
+        let options = None;
+        arrow::compute::take(&distinct_string_views, &indicies, options).unwrap()
+    }
 }
 
 /// Return a string of random characters of length 1..=max_len


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to