Rachelint commented on code in PR #15851:
URL: https://github.com/apache/datafusion/pull/15851#discussion_r2059930280


##########
datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs:
##########
@@ -0,0 +1,371 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{collections::HashSet, str::FromStr};
+
+use rand::{seq::SliceRandom, thread_rng, Rng};
+
+/// Random aggregate query builder
+///
+/// Creates queries like
+/// ```sql
+/// SELECT AGG(..) FROM table_name GROUP BY <group_by_columns>
+/// ```
+#[derive(Debug, Default, Clone)]
+pub struct QueryBuilder {
+    // ===================================
+    // Table settings
+    // ===================================
+    /// The name of the table to query
+    table_name: String,
+
+    // ===================================
+    // Grouping settings
+    // ===================================
+    /// Columns to be used in randomly generated `groupings`
+    ///
+    /// # Example
+    ///
+    /// Columns:
+    ///
+    /// ```text
+    ///   [a,b,c,d]
+    /// ```
+    ///
+    /// And randomly generated `groupings` (at least 1 column)
+    /// can be:
+    ///
+    /// ```text
+    ///   [a]
+    ///   [a,b]
+    ///   [a,b,d]
+    ///   ...
+    /// ```
+    ///
+    /// So the resulting SQLs will be:
+    ///
+    /// ```text
+    ///   SELECT aggr FROM t GROUP BY a;
+    ///   SELECT aggr FROM t GROUP BY a,b;
+    ///   SELECT aggr FROM t GROUP BY a,b,d;
+    ///   ...
+    /// ```
+    group_by_columns: Vec<String>,
+
+    /// Maximum number of columns in randomly generated `groupings`
+    max_group_by_columns: usize,
+
+    /// The sort keys of the dataset
+    ///
+    /// Because optimizations are triggered when all or some of the
+    /// grouping columns are the sort keys of the dataset, it is necessary
+    /// to randomly generate some `groupings` based on the dataset sort keys
+    /// for test coverage.
+    ///
+    /// # Example
+    ///
+    /// The dataset includes columns [a,b,c] and is sorted by [a,b].
+    ///
+    /// We may generate SQLs to try to cover the sort-optimization cases like:
+    ///
+    /// ```text
+    ///   SELECT aggr FROM t GROUP BY b; // no permutation case
+    ///   SELECT aggr FROM t GROUP BY a,c; // partial permutation case
+    ///   SELECT aggr FROM t GROUP BY a,b,c; // full permutation case
+    ///   ...
+    /// ```
+    ///
+    /// See [`GroupOrdering`] for more details.
+    ///
+    /// [`GroupOrdering`]: datafusion_physical_plan::aggregates::order::GroupOrdering
+    ///
+    dataset_sort_keys: Vec<Vec<String>>,
+
+    /// Whether to also test the no grouping case like:
+    ///
+    /// ```text
+    ///   SELECT aggr FROM t;
+    /// ```
+    ///
+    no_grouping: bool,
+
+    // ====================================
+    // Aggregation function settings
+    // ====================================
+    /// Aggregate functions to be used in the query
+    /// (function_name, is_distinct)
+    aggregate_functions: Vec<(String, bool)>,
+
+    /// Possible columns for arguments in the aggregate functions
+    ///
+    /// Assumes each
+    arguments: Vec<String>,
+}
+
+impl QueryBuilder {
+    pub fn new() -> Self {
+        Self {
+            no_grouping: true,
+            max_group_by_columns: 3,
+            ..Default::default()
+        }
+    }
+
+    /// Return the table name, if any
+    pub fn table_name(&self) -> &str {
+        &self.table_name
+    }
+
+    /// Set the table name for the query builder
+    pub fn with_table_name(mut self, table_name: impl Into<String>) -> Self {
+        self.table_name = table_name.into();
+        self
+    }
+
+    /// Add a new possible aggregate function to the query builder
+    pub fn with_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), false));
+        self
+    }
+
+    /// Add a new possible `DISTINCT` aggregate function to the query
+    ///
+    /// This is different than `with_aggregate_function` because only certain
+    /// aggregates support `DISTINCT`
+    pub fn with_distinct_aggregate_function(
+        mut self,
+        aggregate_function: impl Into<String>,
+    ) -> Self {
+        self.aggregate_functions
+            .push((aggregate_function.into(), true));
+        self
+    }
+
+    /// Set the columns to be used in the group by clauses
+    pub fn set_group_by_columns<'a>(
+        mut self,
+        group_by: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        self.group_by_columns = group_by.into_iter().map(String::from).collect();
+        self
+    }
+
+    /// Add one or more columns to be used as arguments in the aggregate functions
+    pub fn with_aggregate_arguments<'a>(
+        mut self,
+        arguments: impl IntoIterator<Item = &'a str>,
+    ) -> Self {
+        let arguments = arguments.into_iter().map(String::from);
+        self.arguments.extend(arguments);
+        self
+    }
+
+    /// Set the maximum number of group by columns (default: 3). For example,
+    /// if it is set to 1, the generated SQL will group by at most 1 column.
+    #[allow(dead_code)]
+    pub fn with_max_group_by_columns(mut self, group_by_columns: usize) -> Self {
+        self.max_group_by_columns = group_by_columns;
+        self
+    }
+
+    /// Set the sort keys of the dataset, if any; the builder will then generate
+    /// queries based on them to cover the sort-optimization cases
+    pub fn with_dataset_sort_keys(mut self, dataset_sort_keys: Vec<Vec<String>>) -> Self {

Review Comment:
   If I don't misunderstand, is the actual requirement like this?
   - We need to test the sorted group by optimization path
   - But currently, it is hard to randomly generate the right sort keys, because we need to restrict the sort keys to be the group by keys? (See the usage sketch below.)
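
   To make the question concrete, here is a minimal, hypothetical usage sketch of the builder API in this hunk (the table name, columns, and sort keys are made up for illustration). Note that the group by columns and the dataset sort keys are configured independently, which is what raises the coverage question:

   ```rust
   // Hypothetical usage of the QueryBuilder from this file; the table name,
   // column names, aggregate functions, and sort keys are illustrative only.
   let _builder = QueryBuilder::new()
       .with_table_name("fuzz_table")
       .with_aggregate_function("sum")
       .with_distinct_aggregate_function("count")
       .with_aggregate_arguments(["a", "b", "c", "d", "e"])
       .set_group_by_columns(["a", "b", "c", "d", "e"])
       // Sort keys of the dataset: groupings derived from these are what
       // exercise the sorted / partially sorted aggregation paths.
       .with_dataset_sort_keys(vec![vec!["a".to_string(), "b".to_string()]]);
   ```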



##########
datafusion/core/tests/fuzz_cases/aggregation_fuzzer/query_builder.rs:
##########

Review Comment:
   For example, suppose the dataset has columns `[a,b,c,d,e]`.

   And we randomly pick `[a,b]` as the `sort key`.

   But unfortunately, only `[c,d,e]` is picked as the `group by key`.

   So in the end, the test cases cannot cover the sorted-optimization?
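
   One way to guarantee that overlap, sketched below, would be to seed each grouping with a prefix of the dataset sort keys and then pad it with other randomly chosen columns. The helper name and signature are hypothetical (not part of this PR), and the sketch assumes the rand 0.8-style API already imported in this file:

   ```rust
   use rand::{seq::SliceRandom, Rng};

   /// Hypothetical helper (not part of this PR): build one grouping that is
   /// guaranteed to overlap the dataset sort keys, so the sorted / partially
   /// sorted aggregation paths get exercised. Assumes `sort_keys` is non-empty
   /// and `max_group_by_columns >= 1` (the builder's default is 3).
   fn grouping_with_sort_prefix(
       rng: &mut impl Rng,
       sort_keys: &[String],     // e.g. ["a", "b"]
       other_columns: &[String], // e.g. ["c", "d", "e"]
       max_group_by_columns: usize,
   ) -> Vec<String> {
       // Take a non-empty prefix of the sort keys, e.g. ["a"] or ["a", "b"].
       let max_prefix = sort_keys.len().min(max_group_by_columns);
       let prefix_len = rng.gen_range(1..=max_prefix);
       let mut grouping: Vec<String> = sort_keys[..prefix_len].to_vec();

       // Pad with randomly chosen non-sort columns, staying within the maximum.
       let remaining = max_group_by_columns - prefix_len;
       let mut extra: Vec<String> = other_columns.to_vec();
       extra.shuffle(rng);
       grouping.extend(extra.into_iter().take(rng.gen_range(0..=remaining)));

       grouping
   }
   ```

   With something like this, the groupings seeded from the sort keys would hit the full and partial `GroupOrdering` paths, while the purely random groupings still cover the unsorted path.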



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

